<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:01:28 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-6584] OSS hit LBUG and crash</title>
                <link>https://jira.whamcloud.com/browse/LU-6584</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>
&lt;p&gt;No sign or indication, i.e. lustre-log or error messages; the OSS unexpectedly crashed (please see console image).&lt;/p&gt;

&lt;p&gt;/var/log/messages is attached&lt;/p&gt;
</description>
                <environment>[&lt;a href=&apos;mailto:root@panda-oss-25-4&apos;&gt;root@panda-oss-25-4&lt;/a&gt; ~]# uname -a&lt;br/&gt;
Linux panda-oss-25-4.sdsc.edu 3.10.73-1.el6.elrepo.x86_64 #1 SMP Thu Mar 26 16:28:30 EDT 2015 x86_64 x86_64 x86_64 GNU/Linux&lt;br/&gt;
[&lt;a href=&apos;mailto:root@panda-oss-25-4&apos;&gt;root@panda-oss-25-4&lt;/a&gt; ~]# rpm -aq | grep lustre&lt;br/&gt;
lustre-2.7.51-3.10.73_1.el6.elrepo.x86_64_gb019b03.x86_64&lt;br/&gt;
lustre-osd-zfs-mount-2.7.51-3.10.73_1.el6.elrepo.x86_64_gb019b03.x86_64&lt;br/&gt;
lustre-iokit-2.7.51-3.10.73_1.el6.elrepo.x86_64_gb019b03.x86_64&lt;br/&gt;
lustre-source-2.7.51-3.10.73_1.el6.elrepo.x86_64_gb019b03.x86_64&lt;br/&gt;
lustre-osd-zfs-2.7.51-3.10.73_1.el6.elrepo.x86_64_gb019b03.x86_64&lt;br/&gt;
lustre-modules-2.7.51-3.10.73_1.el6.elrepo.x86_64_gb019b03.x86_64&lt;br/&gt;
lustre-tests-2.7.51-3.10.73_1.el6.elrepo.x86_64_gb019b03.x86_64&lt;br/&gt;
&lt;br/&gt;
</environment>
        <key id="29990">LU-6584</key>
            <summary>OSS hit LBUG and crash</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="tappro">Mikhail Pershin</assignee>
                                    <reporter username="haisong">Haisong Cai</reporter>
                        <labels>
                            <label>sdsc</label>
                    </labels>
                <created>Fri, 8 May 2015 00:05:20 +0000</created>
                <updated>Mon, 21 Dec 2015 18:49:00 +0000</updated>
                            <resolved>Wed, 7 Oct 2015 18:11:35 +0000</resolved>
                                    <version>Lustre 2.7.0</version>
                                    <fixVersion>Lustre 2.8.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>15</watches>
                                                                            <comments>
                            <comment id="114684" author="haisong" created="Fri, 8 May 2015 00:13:39 +0000"  >
&lt;p&gt;like to add an observation:&lt;/p&gt;

&lt;p&gt;In 2 cases happened to 2 separated OSS so far, an OSS crashed and was brought back, OSTs mounted, then almost immediately it crashed again. &lt;/p&gt;

&lt;p&gt;The console image was taken from second crash.&lt;/p&gt;

&lt;p&gt;Haisong&lt;/p&gt;</comment>
                            <comment id="114716" author="bfaccini" created="Fri, 8 May 2015 12:35:00 +0000"  >&lt;p&gt;Hello Haisong,&lt;br/&gt;
According to your messages/syslog file attached, seems that there is no crash-dump tool (kdump?) enabled on the OSS, right? If this is the case, is it possible to have this setup in order to take a vmcore image for post-mortem analysis upon a new occurrence?&lt;/p&gt;

&lt;p&gt;Only interesting infos found in the messages file is :&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;May  7 15:18:13 panda-oss-23-6 kernel: LustreError: 26222:0:(client.c:173:__ptlrpc_prep_bulk_page()) ASSERTION( pageoffset + len &amp;lt;= ((1UL) &amp;lt;&amp;lt; 12) ) failed:
May  7 15:18:13 panda-oss-23-6 kernel: LustreError: 26222:0:(client.c:173:__ptlrpc_prep_bulk_page()) LBUG
May  7 15:18:13 panda-oss-23-6 kernel: Pid: 26222, comm: ll_ost_io00_013
May  7 15:18:13 panda-oss-23-6 kernel:
May  7 15:18:13 panda-oss-23-6 kernel: Call Trace:
May  7 15:18:13 panda-oss-23-6 kernel: [&amp;lt;ffffffffa094b857&amp;gt;] libcfs_debug_dumpstack+0x57/0x80 [libcfs]
May  7 15:18:13 panda-oss-23-6 kernel: [&amp;lt;ffffffffa094bdd7&amp;gt;] lbug_with_loc+0x47/0xc0 [libcfs]
May  7 15:18:13 panda-oss-23-6 kernel: [&amp;lt;ffffffffa0fe921b&amp;gt;] __ptlrpc_prep_bulk_page+0xcb/0x190 [ptlrpc]
May  7 15:18:13 panda-oss-23-6 kernel: [&amp;lt;ffffffffa105f590&amp;gt;] tgt_brw_read+0xab0/0x11d0 [ptlrpc]
May  7 15:18:13 panda-oss-23-6 kernel: [&amp;lt;ffffffffa0ffa486&amp;gt;] ? lustre_pack_reply_flags+0xa6/0x1e0 [ptlrpc]
May  7 15:18:13 panda-oss-23-6 kernel: [&amp;lt;ffffffff8109518d&amp;gt;] ? sched_clock_cpu+0xcd/0x110
May  7 15:18:13 panda-oss-23-6 kernel: [&amp;lt;ffffffffa105c7ae&amp;gt;] tgt_handle_request0+0x9e/0x3f0 [ptlrpc]
May  7 15:18:13 panda-oss-23-6 kernel: [&amp;lt;ffffffffa10576c0&amp;gt;] ? tgt_handle_recovery+0x30/0x360 [ptlrpc]
May  7 15:18:13 panda-oss-23-6 kernel: [&amp;lt;ffffffffa105d371&amp;gt;] tgt_request_handle+0x1c1/0x770 [ptlrpc]
May  7 15:18:13 panda-oss-23-6 kernel: [&amp;lt;ffffffffa100a5e3&amp;gt;] ptlrpc_server_handle_request+0x2e3/0xbc0 [ptlrpc]
May  7 15:18:13 panda-oss-23-6 kernel: [&amp;lt;ffffffffa094c3de&amp;gt;] ? cfs_timer_arm+0xe/0x10 [libcfs]
May  7 15:18:13 panda-oss-23-6 kernel: [&amp;lt;ffffffffa095ba0a&amp;gt;] ? lc_watchdog_touch+0x7a/0x190 [libcfs]
May  7 15:18:13 panda-oss-23-6 kernel: [&amp;lt;ffffffffa1003209&amp;gt;] ? ptlrpc_wait_event+0xa9/0x2f0 [ptlrpc]
May  7 15:18:13 panda-oss-23-6 kernel: [&amp;lt;ffffffff8108d273&amp;gt;] ? __wake_up+0x53/0x70
May  7 15:18:13 panda-oss-23-6 kernel: [&amp;lt;ffffffffa100cd3c&amp;gt;] ptlrpc_main+0x9dc/0xd90 [ptlrpc]
May  7 15:18:13 panda-oss-23-6 kernel: [&amp;lt;ffffffffa100c360&amp;gt;] ? ptlrpc_main+0x0/0xd90 [ptlrpc]
May  7 15:18:13 panda-oss-23-6 kernel: [&amp;lt;ffffffff810821be&amp;gt;] kthread+0xce/0xe0
May  7 15:18:13 panda-oss-23-6 kernel: [&amp;lt;ffffffff810820f0&amp;gt;] ? kthread+0x0/0xe0
May  7 15:18:13 panda-oss-23-6 kernel: [&amp;lt;ffffffff815f93c8&amp;gt;] ret_from_fork+0x58/0x90
May  7 15:18:13 panda-oss-23-6 kernel: [&amp;lt;ffffffff810820f0&amp;gt;] ? kthread+0x0/0xe0
May  7 15:18:13 panda-oss-23-6 kernel:
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;so, did something go wrong during remote &amp;lt;-&amp;gt; local  buffers mapping?&lt;/p&gt;</comment>
                            <comment id="114739" author="haisong" created="Fri, 8 May 2015 16:28:35 +0000"  >&lt;p&gt;One of the OSSes that crashed 2 days ago crashed again this morning. I am attaching dmesg here.&lt;br/&gt;
This dmesg was taken between the first and second crash; the scenario appears every time now. &lt;/p&gt;

&lt;p&gt;Just to list all LBUGs we have encountered on this filesystem so far (this morning was panda-oss-25-4)&lt;/p&gt;

&lt;p&gt;May  6 09:06:46 panda-oss-25-4 kernel: LustreError: 30076:0:(client.c:173:__ptlrpc_prep_bulk_page()) LBUG&lt;br/&gt;
May  6 10:18:54 panda-oss-25-4 kernel: LustreError: 11585:0:(client.c:173:__ptlrpc_prep_bulk_page()) LBUG&lt;br/&gt;
May  7 15:18:13 panda-oss-23-6 kernel: LustreError: 26222:0:(client.c:173:__ptlrpc_prep_bulk_page()) LBUG&lt;br/&gt;
May  7 16:47:23 panda-oss-23-6 kernel: LustreError: 10154:0:(client.c:173:__ptlrpc_prep_bulk_page()) LBUG&lt;br/&gt;
May  8 07:45:59 panda-oss-25-4 kernel: LustreError: 12448:0:(client.c:173:__ptlrpc_prep_bulk_page()) LBUG&lt;br/&gt;
May  8 09:12:59 panda-oss-25-4 kernel: LustreError: 10032:0:(client.c:173:__ptlrpc_prep_bulk_page()) LBUG&lt;/p&gt;


&lt;p&gt;To answer your question about kdump, no we don&apos;t have kdump enabled on our OSS/MDS. &lt;br/&gt;
We will have to examine our environment before we can enable it. In the meantime, please suggest an alternative way to collect debug information, if there is any.&lt;/p&gt;

&lt;p&gt;thanks,&lt;br/&gt;
Haisong&lt;/p&gt;</comment>
                            <comment id="114740" author="haisong" created="Fri, 8 May 2015 16:32:46 +0000"  >&lt;p&gt;Adding 2 lustre-logs for this morning&apos;s crash, on panda-oss-25-4&lt;/p&gt;</comment>
                            <comment id="114759" author="haisong" created="Fri, 8 May 2015 20:06:55 +0000"  >&lt;p&gt;Another OSS crashed, the third OSS. Same pattern: the first crash, brought back OSS/OST, and in less than a minute it comes with the second crash. It generally stays on and functional for hours until the next crash. &lt;/p&gt;</comment>
                            <comment id="114781" author="pjones" created="Fri, 8 May 2015 22:53:35 +0000"  >&lt;p&gt;Mike&lt;/p&gt;

&lt;p&gt;Could you please advise on this one?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="114805" author="rpwagner" created="Sat, 9 May 2015 04:24:51 +0000"  >&lt;p&gt;Haisong,&lt;/p&gt;

&lt;p&gt;This is hitting wombat (our test partition) when I&apos;m just running IOR. Both partitions have the same stack, now, so we may be able to reproduce it quickly. This also eliminates it being caused by pathological client IO on the production side.&lt;/p&gt;

&lt;p&gt;--Rick&lt;/p&gt;</comment>
                            <comment id="114807" author="rpwagner" created="Sat, 9 May 2015 04:33:08 +0000"  >&lt;p&gt;Mike,&lt;/p&gt;

&lt;p&gt;If it&apos;s any help, we were running v2.6.92 with a handful of patches related to large block support with ZFS (&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4820&quot; title=&quot;extra memcpy in read path&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4820&quot;&gt;&lt;del&gt;LU-4820&lt;/del&gt;&lt;/a&gt;, &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5278&quot; title=&quot;ZFS - many OST watchdogs with IOR&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5278&quot;&gt;&lt;del&gt;LU-5278&lt;/del&gt;&lt;/a&gt;, &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6038&quot; title=&quot;ZFS 0.6.4 Compatibility&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6038&quot;&gt;&lt;del&gt;LU-6038&lt;/del&gt;&lt;/a&gt;, &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6038&quot; title=&quot;ZFS 0.6.4 Compatibility&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6038&quot;&gt;&lt;del&gt;LU-6038&lt;/del&gt;&lt;/a&gt;, &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6152&quot; title=&quot;zfs large block support&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6152&quot;&gt;&lt;del&gt;LU-6152&lt;/del&gt;&lt;/a&gt;, &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6155&quot; title=&quot;osd_count_not_mapped() calls dbuf_hold_impl() without the lock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6155&quot;&gt;&lt;del&gt;LU-6155&lt;/del&gt;&lt;/a&gt;) and did not see this. After 2.7 was released, we moved to v2.7.52 with the patches that hadn&apos;t landed yet. So, whatever&apos;s going on crept in between v2.6.92 and v2.7.52.&lt;/p&gt;

&lt;p&gt;--Rick&lt;/p&gt;</comment>
                            <comment id="115005" author="tappro" created="Tue, 12 May 2015 11:19:23 +0000"  >&lt;p&gt;Thanks for info, I am investigating this.&lt;/p&gt;</comment>
                            <comment id="116316" author="gerrit" created="Mon, 25 May 2015 09:37:20 +0000"  >&lt;p&gt;Mike Pershin (mike.pershin@intel.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/14926&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/14926&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6584&quot; title=&quot;OSS hit LBUG and crash&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6584&quot;&gt;&lt;del&gt;LU-6584&lt;/del&gt;&lt;/a&gt; osd: add more debug to check buffer size&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 9a7be00ed34a2c7f44a42f2603200977b47118d4&lt;/p&gt;</comment>
                            <comment id="116317" author="tappro" created="Mon, 25 May 2015 09:38:55 +0000"  >&lt;p&gt;I wasn&apos;t able to find exact place of problem by inspecting related code. There is patch to get a little bit more info when this bug happens.&lt;/p&gt;</comment>
                            <comment id="117072" author="adilger" created="Mon, 1 Jun 2015 17:46:43 +0000"  >&lt;p&gt;Mike, could you please take a look though the patches between 2.6.92 and 2.7.52 to see if there are any likely candidates?&lt;/p&gt;</comment>
                            <comment id="118398" author="haisong" created="Fri, 12 Jun 2015 17:26:47 +0000"  >
&lt;p&gt;We hit another LBUG and it looks like the same kind.&lt;br/&gt;
====&lt;/p&gt;

&lt;p&gt;Jun 12 01:01:45 panda-oss-25-2 kernel: LustreError: 29442:0:(client.c:173:__ptlrpc_prep_bulk_page()) ASSERTION( pageoffset + len &amp;lt;= ((1UL) &amp;lt;&amp;lt; 12) ) failed: &lt;br/&gt;
Jun 12 01:01:45 panda-oss-25-2 kernel: LustreError: 29442:0:(client.c:173:__ptlrpc_prep_bulk_page()) LBUG&lt;br/&gt;
Jun 12 01:01:45 panda-oss-25-2 kernel: Pid: 29442, comm: ll_ost_io01_105&lt;br/&gt;
Jun 12 01:01:45 panda-oss-25-2 kernel: &lt;br/&gt;
Jun 12 01:01:45 panda-oss-25-2 kernel: Call Trace:&lt;br/&gt;
Jun 12 01:01:45 panda-oss-25-2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0940857&amp;gt;&amp;#93;&lt;/span&gt; libcfs_debug_dumpstack+0x57/0x80 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 12 01:01:45 panda-oss-25-2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0940dd7&amp;gt;&amp;#93;&lt;/span&gt; lbug_with_loc+0x47/0xc0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 12 01:01:45 panda-oss-25-2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0fde21b&amp;gt;&amp;#93;&lt;/span&gt; __ptlrpc_prep_bulk_page+0xcb/0x190 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 12 01:01:45 panda-oss-25-2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa1054590&amp;gt;&amp;#93;&lt;/span&gt; tgt_brw_read+0xab0/0x11d0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 12 01:01:45 panda-oss-25-2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0fef486&amp;gt;&amp;#93;&lt;/span&gt; ? lustre_pack_reply_flags+0xa6/0x1e0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 12 01:01:45 panda-oss-25-2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8109518d&amp;gt;&amp;#93;&lt;/span&gt; ? sched_clock_cpu+0xcd/0x110&lt;br/&gt;
Jun 12 01:01:45 panda-oss-25-2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa10517ae&amp;gt;&amp;#93;&lt;/span&gt; tgt_handle_request0+0x9e/0x3f0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 12 01:01:46 panda-oss-25-2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa104c6c0&amp;gt;&amp;#93;&lt;/span&gt; ? tgt_handle_recovery+0x30/0x360 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 12 01:01:46 panda-oss-25-2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa1052371&amp;gt;&amp;#93;&lt;/span&gt; tgt_request_handle+0x1c1/0x770 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 12 01:01:46 panda-oss-25-2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0fff5e3&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_server_handle_request+0x2e3/0xbc0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 12 01:01:46 panda-oss-25-2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa09413de&amp;gt;&amp;#93;&lt;/span&gt; ? cfs_timer_arm+0xe/0x10 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 12 01:01:46 panda-oss-25-2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0950a0a&amp;gt;&amp;#93;&lt;/span&gt; ? lc_watchdog_touch+0x7a/0x190 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 12 01:01:46 panda-oss-25-2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0ff8209&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_wait_event+0xa9/0x2f0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 12 01:01:46 panda-oss-25-2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8108d273&amp;gt;&amp;#93;&lt;/span&gt; ? __wake_up+0x53/0x70&lt;br/&gt;
Jun 12 01:01:46 panda-oss-25-2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa1001d3c&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_main+0x9dc/0xd90 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 12 01:01:46 panda-oss-25-2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa1001360&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_main+0x0/0xd90 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 12 01:01:46 panda-oss-25-2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff810821be&amp;gt;&amp;#93;&lt;/span&gt; kthread+0xce/0xe0&lt;br/&gt;
Jun 12 01:01:46 panda-oss-25-2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff810820f0&amp;gt;&amp;#93;&lt;/span&gt; ? kthread+0x0/0xe0&lt;br/&gt;
Jun 12 01:01:46 panda-oss-25-2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff815f93c8&amp;gt;&amp;#93;&lt;/span&gt; ret_from_fork+0x58/0x90&lt;br/&gt;
Jun 12 01:01:46 panda-oss-25-2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff810820f0&amp;gt;&amp;#93;&lt;/span&gt; ? kthread+0x0/0xe0&lt;br/&gt;
Jun 12 01:01:46 panda-oss-25-2 kernel: &lt;br/&gt;
Jun 12 01:01:46 panda-oss-25-2 kernel: Kernel panic - not syncing: LBUG&lt;br/&gt;
Jun 12 01:01:46 panda-oss-25-2 kernel: CPU: 11 PID: 29442 Comm: ll_ost_io01_105 Tainted: P           O 3.10.73-1.el6.elrepo.x86_64 #1&lt;br/&gt;
Jun 12 01:01:46 panda-oss-25-2 kernel: Hardware name: Intel Corporation S2600CP/S2600CP, BIOS SE5C600.86B.02.04.0003.102320141138 10/23/2014&lt;br/&gt;
Jun 12 01:01:46 panda-oss-25-2 kernel: 000000006fe000c0 ffff881e2f76fa88 ffffffff815ece4a ffff881e2f76fb08&lt;br/&gt;
Jun 12 01:01:46 panda-oss-25-2 kernel: ffffffff815ecbeb ffffffff00000008 ffff881e2f76fb18 ffff881e2f76fab8&lt;br/&gt;
Jun 12 01:01:46 panda-oss-25-2 kernel: ffffffffa106cb20 ffff881e2f76fac8 0000000000000000 ffffffffa095f5b1&lt;br/&gt;
Jun 12 01:01:46 panda-oss-25-2 kernel: Call Trace:&lt;br/&gt;
Jun 12 01:01:46 panda-oss-25-2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff815ece4a&amp;gt;&amp;#93;&lt;/span&gt; dump_stack+0x19/0x1f&lt;br/&gt;
Jun 12 01:01:46 panda-oss-25-2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff815ecbeb&amp;gt;&amp;#93;&lt;/span&gt; panic+0xc4/0x1e4&lt;br/&gt;
Jun 12 01:01:46 panda-oss-25-2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0940e3b&amp;gt;&amp;#93;&lt;/span&gt; lbug_with_loc+0xab/0xc0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 12 01:01:46 panda-oss-25-2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0fde21b&amp;gt;&amp;#93;&lt;/span&gt; __ptlrpc_prep_bulk_page+0xcb/0x190 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 12 01:01:46 panda-oss-25-2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa1054590&amp;gt;&amp;#93;&lt;/span&gt; tgt_brw_read+0xab0/0x11d0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 12 01:01:46 panda-oss-25-2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0fef486&amp;gt;&amp;#93;&lt;/span&gt; ? lustre_pack_reply_flags+0xa6/0x1e0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 12 01:01:46 panda-oss-25-2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8109518d&amp;gt;&amp;#93;&lt;/span&gt; ? sched_clock_cpu+0xcd/0x110&lt;br/&gt;
Jun 12 01:01:46 panda-oss-25-2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa10517ae&amp;gt;&amp;#93;&lt;/span&gt; tgt_handle_request0+0x9e/0x3f0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 12 01:01:46 panda-oss-25-2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa104c6c0&amp;gt;&amp;#93;&lt;/span&gt; ? tgt_handle_recovery+0x30/0x360 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;/p&gt;</comment>
                            <comment id="118560" author="adilger" created="Mon, 15 Jun 2015 17:34:39 +0000"  >&lt;p&gt;Haisong, it looks like you are not yet running the debug patch from Mike (&lt;a href=&quot;http://review.whamcloud.com/14926&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/14926&lt;/a&gt;) on your systems.  It would be useful if you applied that patch (only needed on the servers) so that we can capture more information about this failure.&lt;/p&gt;</comment>
                            <comment id="119772" author="tappro" created="Mon, 29 Jun 2015 05:52:36 +0000"  >&lt;p&gt;Rick, what version of ZFS are you using, 0.6.4?&lt;/p&gt;</comment>
                            <comment id="119895" author="rpwagner" created="Mon, 29 Jun 2015 23:10:08 +0000"  >&lt;p&gt;Hi Mikhail,&lt;/p&gt;

&lt;p&gt;Yes, 0.6.4, but with the first large block support pull request, 2865.&lt;/p&gt;

&lt;p&gt;Here&apos;s the SPL and ZFS build process I used.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;git clone https://github.com/zfsonlinux/spl.git
cd spl
git checkout spl-0.6.4
./autogen.sh
./configure --disable-debug
make pkg
rpm -ivh *x86_64.rpm


git clone https://github.com/zfsonlinux/zfs.git
cd zfs
git fetch -t https://github.com/zfsonlinux/zfs.git refs/pull/2865/head:lgblock
git checkout zfs-0.6.4
git merge lgblock
./autogen.sh
./configure --disable-debug
make pkg
rpm -ivh *x86_64.rpm
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="120131" author="tappro" created="Thu, 2 Jul 2015 09:56:01 +0000"  >&lt;p&gt;Rick, thank you for that info, I am trying to reproduce that situation. Meanwhile, were there any other specific tuning in ZFS, e.g. maximum block size or anything related to the block size?&lt;/p&gt;

&lt;p&gt;Also what Lustre patches are you using over the stock Lustre 2.7.52? I am asking because 2.7.52 can&apos;t be built with zfs version you are using, because of SPA_MAXBLOCKSHIFT is bigger now&lt;/p&gt;</comment>
                            <comment id="120169" author="rpwagner" created="Thu, 2 Jul 2015 16:43:54 +0000"  >&lt;p&gt;Mikhail,&lt;/p&gt;

&lt;p&gt;For ZFS with large block support, our record size is set to 1024k. On the Lustre side we&apos;ve got a few patches applied, one or more of which has not landed:&lt;/p&gt;

&lt;ul&gt;
	&lt;li&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4820&quot; title=&quot;extra memcpy in read path&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4820&quot;&gt;&lt;del&gt;LU-4820&lt;/del&gt;&lt;/a&gt; osd: drop memcpy in zfs osd&lt;/li&gt;
	&lt;li&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6038&quot; title=&quot;ZFS 0.6.4 Compatibility&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6038&quot;&gt;&lt;del&gt;LU-6038&lt;/del&gt;&lt;/a&gt; osd-zfs: Avoid redefining KM_SLEEP&lt;/li&gt;
	&lt;li&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6038&quot; title=&quot;ZFS 0.6.4 Compatibility&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6038&quot;&gt;&lt;del&gt;LU-6038&lt;/del&gt;&lt;/a&gt; osd-zfs: sa_spill_alloc()/sa_spill_free() compat&lt;/li&gt;
	&lt;li&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6152&quot; title=&quot;zfs large block support&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6152&quot;&gt;&lt;del&gt;LU-6152&lt;/del&gt;&lt;/a&gt; osd-zfs: ZFS large block compat&lt;/li&gt;
	&lt;li&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6155&quot; title=&quot;osd_count_not_mapped() calls dbuf_hold_impl() without the lock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6155&quot;&gt;&lt;del&gt;LU-6155&lt;/del&gt;&lt;/a&gt; osd-zfs: dbuf_hold_impl() called without the lock&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6152&quot; title=&quot;zfs large block support&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6152&quot;&gt;&lt;del&gt;LU-6152&lt;/del&gt;&lt;/a&gt; in particular is the one that deals with SPA_MAXBLOCKSIZE and SPA_MAXBLOCKSHIFT.&lt;/p&gt;

&lt;p&gt;At our site, I&apos;ve handed over the build process to Dima Mishin for maintenance. Dima, can you pass on your current build (base commit, cherry picks, and any manual patches) to Mikhail?&lt;/p&gt;</comment>
                            <comment id="120182" author="dimm" created="Thu, 2 Jul 2015 17:42:07 +0000"  >&lt;p&gt;I have a build from commit 8a11cb62 in master, with patch from a44175d (&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6155&quot; title=&quot;osd_count_not_mapped() calls dbuf_hold_impl() without the lock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6155&quot;&gt;&lt;del&gt;LU-6155&lt;/del&gt;&lt;/a&gt;), and cherry-picked c39e02e8 (&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6584&quot; title=&quot;OSS hit LBUG and crash&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6584&quot;&gt;&lt;del&gt;LU-6584&lt;/del&gt;&lt;/a&gt; - debug), and latest master commits of zfs and spl (a7b10a931, 77ab5dd33a)&lt;/p&gt;</comment>
                            <comment id="121009" author="mdiep" created="Fri, 10 Jul 2015 17:31:27 +0000"  >&lt;p&gt;Haisong,&lt;/p&gt;

&lt;p&gt;How often do you see this in production? could you apply the debug patch  (&lt;a href=&quot;http://review.whamcloud.com/14926)?&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/14926)?&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Thanks&lt;br/&gt;
-Minh&lt;/p&gt;</comment>
                            <comment id="121014" author="haisong" created="Fri, 10 Jul 2015 17:57:19 +0000"  >&lt;p&gt;Hi Minh,&lt;/p&gt;

&lt;p&gt;Depending on how file-system is being used, we hit the bug between every day (as I described when I created this ticket) to several weeks apart.&lt;/p&gt;

&lt;p&gt;We are in the process of applying the patch on to our production file-system.&lt;/p&gt;

&lt;p&gt;Haisong&lt;/p&gt;</comment>
                            <comment id="121226" author="haisong" created="Tue, 14 Jul 2015 06:42:51 +0000"  >
&lt;p&gt;We are very close to getting the debug patch deployed onto our production systems.&lt;br/&gt;
But as an FYI, the following is a list of all the hits of the LBUG, and each hit is an OSS downtime:&lt;/p&gt;

&lt;p&gt;This is the number of times we have hit the lbug &lt;em&gt;today&lt;/em&gt; Each hit is an OSS downtime.&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@oasis-panda cai&amp;#93;&lt;/span&gt;# grep -i lbug /var/log/messages&lt;br/&gt;
Jul 13 12:27:36 panda-oss-24-4 kernel: LustreError: 29775:0:(client.c:173:__ptlrpc_prep_bulk_page()) LBUG&lt;br/&gt;
Jul 13 12:27:37 panda-oss-24-4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0951dd7&amp;gt;&amp;#93;&lt;/span&gt; lbug_with_loc+0x47/0xc0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jul 13 12:27:37 panda-oss-24-4 kernel: Kernel panic - not syncing: LBUG&lt;br/&gt;
Jul 13 12:48:52 panda-oss-24-4 kernel: LustreError: 10351:0:(client.c:173:__ptlrpc_prep_bulk_page()) LBUG&lt;br/&gt;
Jul 13 13:12:49 panda-oss-22-4 kernel: LustreError: 30300:0:(client.c:173:__ptlrpc_prep_bulk_page()) LBUG&lt;br/&gt;
Jul 13 13:12:49 panda-oss-22-4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa094edd7&amp;gt;&amp;#93;&lt;/span&gt; lbug_with_loc+0x47/0xc0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jul 13 13:46:07 panda-oss-22-4 kernel: LustreError: 10383:0:(client.c:173:__ptlrpc_prep_bulk_page()) LBUG&lt;br/&gt;
Jul 13 13:46:07 panda-oss-22-4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa091bdd7&amp;gt;&amp;#93;&lt;/span&gt; lbug_with_loc+0x47/0xc0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jul 13 17:00:40 panda-oss-25-4 kernel: LustreError: 10468:0:(client.c:173:__ptlrpc_prep_bulk_page()) LBUG&lt;br/&gt;
Jul 13 17:00:40 panda-oss-25-4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa090fdd7&amp;gt;&amp;#93;&lt;/span&gt; lbug_with_loc+0x47/0xc0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jul 13 17:35:02 panda-oss-25-4 kernel: LustreError: 10310:0:(client.c:173:__ptlrpc_prep_bulk_page()) LBUG&lt;br/&gt;
Jul 13 17:47:21 panda-oss-24-1 kernel: LustreError: 29652:0:(client.c:173:__ptlrpc_prep_bulk_page()) LBUG&lt;br/&gt;
Jul 13 17:47:21 panda-oss-24-1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0944dd7&amp;gt;&amp;#93;&lt;/span&gt; lbug_with_loc+0x47/0xc0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jul 13 19:14:13 panda-oss-24-1 kernel: LustreError: 10101:0:(client.c:173:__ptlrpc_prep_bulk_page()) LBUG&lt;br/&gt;
Jul 13 19:14:13 panda-oss-24-1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa091edd7&amp;gt;&amp;#93;&lt;/span&gt; lbug_with_loc+0x47/0xc0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jul 13 19:14:14 panda-oss-24-1 kernel: Kernel panic - not syncing: LBUG&lt;br/&gt;
Jul 13 19:14:14 panda-oss-24-1 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa091ee3b&amp;gt;&amp;#93;&lt;/span&gt; lbug_with_loc+0xab/0xc0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jul 13 22:48:29 panda-oss-25-4 kernel: LustreError: 10161:0:(client.c:173:__ptlrpc_prep_bulk_page()) LBUG&lt;br/&gt;
Jul 13 22:48:29 panda-oss-25-4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0914dd7&amp;gt;&amp;#93;&lt;/span&gt; lbug_with_loc+0x47/0xc0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jul 13 23:15:00 panda-oss-25-4 kernel: LustreError: 10185:0:(client.c:173:__ptlrpc_prep_bulk_page()) LBUG&lt;br/&gt;
Jul 13 23:15:01 panda-oss-25-4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa090edd7&amp;gt;&amp;#93;&lt;/span&gt; lbug_with_loc+0x47/0xc0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jul 13 23:15:01 panda-oss-25-4 kernel: Kernel panic - not syncing: LBUG&lt;br/&gt;
Jul 13 23:15:01 panda-oss-25-4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa090ee3b&amp;gt;&amp;#93;&lt;/span&gt; lbug_with_loc+0xab/0xc0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jul 13 23:21:26 panda-oss-25-4 kernel: LustreError: 10265:0:(client.c:173:__ptlrpc_prep_bulk_page()) LBUG&lt;br/&gt;
Jul 13 23:21:26 panda-oss-25-4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0916dd7&amp;gt;&amp;#93;&lt;/span&gt; lbug_with_loc+0x47/0xc0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;/p&gt;</comment>
                            <comment id="121246" author="haisong" created="Tue, 14 Jul 2015 16:44:23 +0000"  >
&lt;p&gt;In all crash cases these 2 lines always coupled:&lt;/p&gt;

&lt;p&gt;Jul 13 23:21:26 panda-oss-25-4 kernel: LustreError: 10265:0:(client.c:173:__ptlrpc_prep_bulk_page()) ASSERTION( pageoffset + len &amp;lt;= ((1UL) &amp;lt;&amp;lt; 12) ) failed: &lt;br/&gt;
Jul 13 23:21:26 panda-oss-25-4 kernel: LustreError: 10265:0:(client.c:173:__ptlrpc_prep_bulk_page()) LBUG&lt;/p&gt;

&lt;p&gt;and most of the time followed by this line:&lt;br/&gt;
Jul 13 23:21:26 panda-oss-25-4 kernel: Pid: 10265, comm: ll_ost_io00_031&lt;/p&gt;

&lt;p&gt;I am attaching /var/log/messages here that include all incidences from yesterday.&lt;/p&gt;</comment>
                            <comment id="121247" author="haisong" created="Tue, 14 Jul 2015 16:45:51 +0000"  >&lt;p&gt;collected with command line:&lt;/p&gt;

&lt;p&gt;cat /var/log/messages | egrep -iv &quot;sshd|cron|run-parts|postfix|rsyslog|audispd|named|rockscommand|channel|alert-handler|411-alert&quot; &amp;gt; /tmp/log.$$&lt;/p&gt;


&lt;p&gt;thanks,&lt;br/&gt;
Haisong&lt;/p&gt;</comment>
                            <comment id="121610" author="tappro" created="Sat, 18 Jul 2015 19:46:06 +0000"  >&lt;p&gt;Haisong, what do you mean by &apos;OSS downtime&apos;? &lt;/p&gt;</comment>
                            <comment id="121611" author="haisong" created="Sat, 18 Jul 2015 21:12:41 +0000"  >&lt;p&gt;Hi Mikhail,&lt;/p&gt;

&lt;p&gt;As I described at the very beginning of this ticket, when LBUG is hit and OSS crashes. &lt;br/&gt;
Machine will have to be power reset. During that time OSS will be unavailable.&lt;/p&gt;

&lt;p&gt;Haisong&lt;/p&gt;


</comment>
                            <comment id="121967" author="bobijam" created="Thu, 23 Jul 2015 01:46:46 +0000"  >&lt;p&gt;Hi, what is the progress of applying the debug patch and collecting the log again? The latest log is still without the debug patch.&lt;/p&gt;</comment>
                            <comment id="122053" author="rpwagner" created="Thu, 23 Jul 2015 19:16:15 +0000"  >&lt;p&gt;Zhenyu, we&apos;re rolling out an update to our production file system today. After discussion with Minh we&apos;ve decided to rebase our code on later releases of ZFS and Lustre that have our necessary patches. The only additional patch we&apos;ve added is the debugging one for this ticket. After we&apos;re done, I&apos;ll ask Dima Mishin to post the exact releases that we&apos;re working from.&lt;/p&gt;</comment>
                            <comment id="122084" author="dimm" created="Thu, 23 Jul 2015 23:27:08 +0000"  >&lt;p&gt;We used:&lt;br/&gt;
kernel-lt-3.10.73-1.el6.elrepo from kernel-lt&lt;br/&gt;
spl 77ab5dd33a99&lt;br/&gt;
zfs f1512ee61e2f22&lt;br/&gt;
lustre 1ef0185e8c12 with applied dba1c12e debug patch&lt;/p&gt;</comment>
                            <comment id="122791" author="haisong" created="Thu, 30 Jul 2015 22:26:35 +0000"  >&lt;p&gt;We hit another LBUG today. OSS is running debug patch. &lt;br/&gt;
Here is the /var/log/message and let me know what else you want me to collect.&lt;/p&gt;

&lt;p&gt;Jul 30 15:08:25 panda-oss-25-4 kernel: LustreError: 11719:0:(client.c:211:__ptlrpc_prep_bulk_page()) ASSERTION( pageoffset + len &amp;lt;= PAGE_CACHE_SIZE ) failed: offset 0, len 1913970688&lt;br/&gt;
Jul 30 15:08:25 panda-oss-25-4 kernel: LustreError: 11719:0:(client.c:211:__ptlrpc_prep_bulk_page()) LBUG&lt;br/&gt;
Jul 30 15:08:26 panda-oss-25-4 kernel: Pid: 11719, comm: ll_ost_io00_045&lt;br/&gt;
Jul 30 15:08:26 panda-oss-25-4 kernel: &lt;br/&gt;
Jul 30 15:08:26 panda-oss-25-4 kernel: Call Trace:&lt;br/&gt;
Jul 30 15:08:26 panda-oss-25-4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0919857&amp;gt;&amp;#93;&lt;/span&gt; libcfs_debug_dumpstack+0x57/0x80 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jul 30 15:08:26 panda-oss-25-4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0919dd7&amp;gt;&amp;#93;&lt;/span&gt; lbug_with_loc+0x47/0xc0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jul 30 15:08:26 panda-oss-25-4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0ca02fc&amp;gt;&amp;#93;&lt;/span&gt; __ptlrpc_prep_bulk_page+0x12c/0x1f0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jul 30 15:08:26 panda-oss-25-4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0ca03cc&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_prep_bulk_page_nopin+0xc/0x10 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jul 30 15:08:26 panda-oss-25-4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0d1d432&amp;gt;&amp;#93;&lt;/span&gt; tgt_brw_read+0xab2/0x1210 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jul 30 15:08:26 panda-oss-25-4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0cb06b6&amp;gt;&amp;#93;&lt;/span&gt; ? lustre_pack_reply_flags+0xa6/0x1e0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jul 30 15:08:26 panda-oss-25-4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8109518d&amp;gt;&amp;#93;&lt;/span&gt; ? sched_clock_cpu+0xcd/0x110&lt;br/&gt;
Jul 30 15:08:26 panda-oss-25-4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0d1aca6&amp;gt;&amp;#93;&lt;/span&gt; tgt_handle_request0+0x2c6/0x430 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jul 30 15:08:26 panda-oss-25-4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0d1567a&amp;gt;&amp;#93;&lt;/span&gt; ? tgt_handle_recovery+0x14a/0x310 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jul 30 15:08:26 panda-oss-25-4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0d1b025&amp;gt;&amp;#93;&lt;/span&gt; tgt_request_handle+0x215/0x990 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jul 30 15:08:26 panda-oss-25-4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0cc07c3&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_server_handle_request+0x2e3/0xbc0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jul 30 15:08:26 panda-oss-25-4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa092760a&amp;gt;&amp;#93;&lt;/span&gt; ? lc_watchdog_touch+0x7a/0x190 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jul 30 15:08:26 panda-oss-25-4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0cb9ba9&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_wait_event+0xa9/0x300 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jul 30 15:08:26 panda-oss-25-4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8108d273&amp;gt;&amp;#93;&lt;/span&gt; ? __wake_up+0x53/0x70&lt;br/&gt;
Jul 30 15:08:26 panda-oss-25-4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0cc22cc&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_main+0x9ec/0xd30 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jul 30 15:08:26 panda-oss-25-4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0cc18e0&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_main+0x0/0xd30 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jul 30 15:08:26 panda-oss-25-4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff810821be&amp;gt;&amp;#93;&lt;/span&gt; kthread+0xce/0xe0&lt;br/&gt;
Jul 30 15:08:26 panda-oss-25-4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff810820f0&amp;gt;&amp;#93;&lt;/span&gt; ? kthread+0x0/0xe0&lt;br/&gt;
Jul 30 15:08:26 panda-oss-25-4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff815f93c8&amp;gt;&amp;#93;&lt;/span&gt; ret_from_fork+0x58/0x90&lt;br/&gt;
Jul 30 15:08:26 panda-oss-25-4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff810820f0&amp;gt;&amp;#93;&lt;/span&gt; ? kthread+0x0/0xe0&lt;br/&gt;
Jul 30 15:08:26 panda-oss-25-4 kernel: &lt;/p&gt;</comment>
                            <comment id="122794" author="haisong" created="Thu, 30 Jul 2015 23:34:38 +0000"  >
&lt;p&gt;In little more than an hour:&lt;/p&gt;

&lt;p&gt;Jul 30 15:08:25 panda-oss-25-4 kernel: LustreError: 11719:0:(client.c:211:__ptlrpc_prep_bulk_page()) LBUG&lt;br/&gt;
Jul 30 15:25:13 panda-oss-25-4 kernel: LustreError: 9907:0:(client.c:211:__ptlrpc_prep_bulk_page()) LBUG&lt;br/&gt;
Jul 30 15:25:13 panda-oss-25-4 kernel: Kernel panic - not syncing: LBUG&lt;br/&gt;
Jul 30 16:16:39 panda-oss-25-4 kernel: LustreError: 10230:0:(client.c:211:__ptlrpc_prep_bulk_page()) LBUG&lt;/p&gt;</comment>
                            <comment id="123611" author="adilger" created="Fri, 7 Aug 2015 17:59:25 +0000"  >&lt;p&gt;Is it worthwhile to print out the whole lnb struct (maybe with neighboring values also) to see if there is memory corruption that can be identified?  Is the oops always in the same place? Then it seems likely that there is some systematic memory corruption (stack overflow, out of bounds array access, etc) rather than random corruption from another thread, which would cause crashes in other parts of the code. &lt;/p&gt;</comment>
                            <comment id="123925" author="bobijam" created="Wed, 12 Aug 2015 06:23:03 +0000"  >&lt;p&gt;Hi Haisong,&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;http://review.whamcloud.com/#/c/14926/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/14926/&lt;/a&gt; has been updated to include more debug information to catch the LBUG, would you please apply it and re-hit the issue and collect core dump? Please add &quot;ha&quot; debug before re-run the test, as &lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;lctl set_param debug=&lt;span class=&quot;code-quote&quot;&gt;&quot;+ha&quot;&lt;/span&gt; &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="123975" author="haisong" created="Wed, 12 Aug 2015 18:03:35 +0000"  >&lt;p&gt;Hi Zhenyu,&lt;/p&gt;

&lt;p&gt;We will try to apply the new debug patch. But since this is our production file-system, we will have to plan it.&lt;br/&gt;
We hit this LBUG randomly depending on how the file-system is used. For example we just had 7 hits during last 24hrs, &lt;br/&gt;
after about 12 days quiet period.  &lt;/p&gt;

&lt;p&gt;So I have 2 questions:&lt;/p&gt;

&lt;p&gt;1) by any chance you know a way to reproduce the LBUG? The reason I ask is that we have a non-production file-system that&lt;br/&gt;
    we may use for this debugging.&lt;br/&gt;
2) about collecting core dump, do you mean the core dump of specific lustre processes or the entire system? When LBUG is hit, &lt;br/&gt;
    the OSS servers panic. I want to make sure we setup so that core dump can be saved properly. &lt;/p&gt;

&lt;p&gt;I&apos;d like to remind everyone that this LBUG event usually occurs 2 or 3 consecutively, meaning OSS crash, standing OSS up, &lt;br/&gt;
crash again, and standing OSS up again, and the third time, before it stabilized.  Here are examples we had for the last 24hr.  &lt;br/&gt;
Please notice the server names and time stamps:&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@oasis-panda log&amp;#93;&lt;/span&gt;# grep -i lbug messages&lt;br/&gt;
Aug 11 12:32:47 panda-oss-25-4 kernel: LustreError: 10267:0:(client.c:211:__ptlrpc_prep_bulk_page()) LBUG&lt;br/&gt;
Aug 11 12:32:47 panda-oss-25-4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0920dd7&amp;gt;&amp;#93;&lt;/span&gt; lbug_with_loc+0x47/0xc0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Aug 11 12:32:47 panda-oss-25-4 kernel: Kernel panic - not syncing: LBUG&lt;br/&gt;
Aug 11 12:32:47 panda-oss-25-4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0920e3b&amp;gt;&amp;#93;&lt;/span&gt; lbug_with_loc+0xab/0xc0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Aug 11 13:29:54 panda-oss-25-4 kernel: LustreError: 10206:0:(client.c:211:__ptlrpc_prep_bulk_page()) LBUG&lt;br/&gt;
Aug 11 13:29:54 panda-oss-25-4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0927dd7&amp;gt;&amp;#93;&lt;/span&gt; lbug_with_loc+0x47/0xc0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Aug 11 13:29:55 panda-oss-25-4 kernel: Kernel panic - not syncing: LBUG&lt;br/&gt;
Aug 11 13:29:55 panda-oss-25-4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0927e3b&amp;gt;&amp;#93;&lt;/span&gt; lbug_with_loc+0xab/0xc0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Aug 11 13:36:50 panda-oss-24-4 kernel: LustreError: 11848:0:(client.c:211:__ptlrpc_prep_bulk_page()) LBUG&lt;br/&gt;
Aug 11 13:45:55 panda-oss-24-4 kernel: LustreError: 10229:0:(client.c:211:__ptlrpc_prep_bulk_page()) LBUG&lt;br/&gt;
Aug 11 13:45:55 panda-oss-24-4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0923dd7&amp;gt;&amp;#93;&lt;/span&gt; lbug_with_loc+0x47/0xc0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Aug 12 08:09:43 panda-oss-25-3 kernel: LustreError: 11728:0:(client.c:211:__ptlrpc_prep_bulk_page()) LBUG&lt;br/&gt;
Aug 12 09:48:28 panda-oss-25-3 kernel: LustreError: 11232:0:(client.c:211:__ptlrpc_prep_bulk_page()) LBUG&lt;br/&gt;
Aug 12 09:48:29 panda-oss-25-3 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0919dd7&amp;gt;&amp;#93;&lt;/span&gt; lbug_with_loc+0x47/0xc0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Aug 12 09:48:29 panda-oss-25-3 kernel: Kernel panic - not syncing: LBUG&lt;br/&gt;
Aug 12 09:48:29 panda-oss-25-3 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0919e3b&amp;gt;&amp;#93;&lt;/span&gt; lbug_with_loc+0xab/0xc0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Aug 12 09:52:56 panda-oss-25-3 kernel: LustreError: 10548:0:(client.c:211:__ptlrpc_prep_bulk_page()) LBUG&lt;br/&gt;
Aug 12 09:52:56 panda-oss-25-3 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0916dd7&amp;gt;&amp;#93;&lt;/span&gt; lbug_with_loc+0x47/0xc0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Aug 12 09:52:58 panda-oss-25-3 kernel: Kernel panic - not syncing: LBUG&lt;br/&gt;
Aug 12 09:52:58 panda-oss-25-3 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0916e3b&amp;gt;&amp;#93;&lt;/span&gt; lbug_with_loc+0xab/0xc0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@oasis-panda log&amp;#93;&lt;/span&gt;# &lt;/p&gt;

&lt;p&gt;thanks,&lt;br/&gt;
Haisong&lt;/p&gt;</comment>
                            <comment id="124012" author="bobijam" created="Thu, 13 Aug 2015 01:48:00 +0000"  >&lt;p&gt;1. Unfortunately we don&apos;t know the reason of the LBUG, we just saw that some IO has invalid page length, and it seems there is some specific memory corruption there, so the debug patch is try to print out IO pages information if abnormal happens.&lt;/p&gt;

&lt;p&gt;2. You can setup kdump to capture the kernel crash dump as &lt;a href=&quot;http://fedoraproject.org/wiki/How_to_use_kdump_to_debug_kernel_crashes&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://fedoraproject.org/wiki/How_to_use_kdump_to_debug_kernel_crashes&lt;/a&gt; guides.&lt;/p&gt;</comment>
                            <comment id="124078" author="adilger" created="Thu, 13 Aug 2015 17:50:19 +0000"  >&lt;p&gt;If the OSS is crashing repeatedly after startup, it may mean that the bad data is arriving from the client during replay and is not being verified properly?  Are there checks up in the ost layer to verify the niobuf_remote contains valid data before it is used in the OSD?  It may be that the corruption is happening on the network or on the client.&lt;/p&gt;</comment>
                            <comment id="124726" author="adilger" created="Thu, 20 Aug 2015 18:46:37 +0000"  >&lt;p&gt;In the boot log I see you are using ZFS 0.6.4.  It looks like there may be fixes in 0.6.4.1 and 0.6.4.2 that may be helpful in this case and &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6607&quot; title=&quot;MDS ( 2 node DNE) running out of memory and crash&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6607&quot;&gt;&lt;del&gt;LU-6607&lt;/del&gt;&lt;/a&gt; (reducing CPU usage on metadata intensive workloads):&lt;br/&gt;
&lt;a href=&quot;https://github.com/zfsonlinux/zfs/releases/tag/zfs-0.6.4.1&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/zfsonlinux/zfs/releases/tag/zfs-0.6.4.1&lt;/a&gt;&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;    Fixed io-spare.sh script for ZED.&lt;br/&gt;
    Fixed multiple deadlocks which might occur when reclaiming memory.&lt;br/&gt;
    Fixed excessive CPU usage for meta data heavy workloads when reclaiming the ARC.&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;&lt;a href=&quot;https://github.com/zfsonlinux/zfs/releases/tag/zfs-0.6.4.2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/zfsonlinux/zfs/releases/tag/zfs-0.6.4.2&lt;/a&gt;&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;    Fix panic due to corrupt nvlist when running utilities zfsonlinux/zfs#3335&lt;br/&gt;
    Fix hard lockup due to infinite loop in zfs_zget() zfsonlinux/zfs#3349&lt;br/&gt;
    Fix panic on unmount due to iput taskq zfsonlinux/zfs#3281&lt;br/&gt;
    Improve metadata shrinker performance on pre-3.1 kernels zfsonlinux/zfs#3501&lt;br/&gt;
    Linux 4.1 compat: use read_iter() / write_iter()&lt;br/&gt;
    Linux 3.12 compat: NUMA-aware per-superblock shrinker&lt;br/&gt;
    Fix spurious hung task watchdog stack traces zfsonlinux/zfs#3402&lt;br/&gt;
    Fix module loading in zfs import systemd service zfsonlinux/zfs#3440&lt;br/&gt;
    Fix intermittent libzfs_init() failure to open /dev/zfs zfsonlinux/zfs#2556&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;Also, just to confirm - are these OSS nodes running with 1MB ZFS blocksize?&lt;/p&gt;</comment>
                            <comment id="124728" author="dimm" created="Thu, 20 Aug 2015 19:05:16 +0000"  >&lt;p&gt;We&apos;re using the f1512ee61e commit from master ZFS branch (large block support). It&apos;s later than 0.6.4.1, and I had problems running with latest master version.&lt;/p&gt;</comment>
                            <comment id="124949" author="rpwagner" created="Mon, 24 Aug 2015 20:12:49 +0000"  >&lt;p&gt;Andreas, yes, we&apos;re using 1MB block sizes on the ZFS datasets that handle the OSTs.&lt;/p&gt;</comment>
                            <comment id="125056" author="adilger" created="Tue, 25 Aug 2015 17:46:44 +0000"  >&lt;p&gt;Bobijam,&lt;br/&gt;
can you please make a new patch that checks the contents of niobuf_remote when it is first accessed by the OST (&lt;tt&gt;tgt_brw_read()&lt;/tt&gt; and &lt;tt&gt;tgt_brw_write()&lt;/tt&gt;) to verify that the contents are sane, and print out all the values under D_BUFFS debugging.  If the values are incorrect a CERROR() should be printed and an &lt;tt&gt;-EPROTO&lt;/tt&gt; error returned to the client, and we can debug this problem as a network corruption.&lt;/p&gt;

&lt;p&gt;This niobuf verification should be in a helper function that can also be called before the currently-failing LASSERT() checks are being handled (and elsewhere in the code if you think it is helpful), and those functions can return an &lt;tt&gt;&lt;del&gt;EIO&lt;/tt&gt; error to the caller rather than triggering the LASSERT.  At that point the client should resend the BRW RPC due to &lt;tt&gt;brw_interpret()&lt;/del&gt;&amp;gt;osc_recoverable_error()&lt;/tt&gt; and hopefully it will succeed on the second try.&lt;/p&gt;

&lt;p&gt;While I don&apos;t think this is a proper solution, it will at least tell us if the corruption is happening on the client and/or on the network, or in memory on the OSS, and it will potentially allow debugging to continue without the high frequency of OSS failures.&lt;/p&gt;</comment>
                            <comment id="125057" author="adilger" created="Tue, 25 Aug 2015 17:49:14 +0000"  >&lt;p&gt;Rick, the other possible avenue for debugging is to disable the 1MB blocksize tunable on one or more of your OST datasets, and see if this correlates to a reduction or elimination of the occurrence of this failure.  This is one of the main deltas between your ZFS environment and other ZFS users, so this would allow us to isolate the memory corruption to the code handling 1MB blocksize.&lt;/p&gt;</comment>
                            <comment id="125149" author="bobijam" created="Wed, 26 Aug 2015 06:38:25 +0000"  >&lt;p&gt;&lt;a href=&quot;http://review.whamcloud.com/#/c/14926/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/14926/&lt;/a&gt; has been updated to add more remote/local buffer check.&lt;/p&gt;</comment>
                            <comment id="125388" author="rpwagner" created="Thu, 27 Aug 2015 15:34:44 +0000"  >&lt;p&gt;We&apos;ve scheduled a maintenance window for Sep. 8 to roll out this latest patch after testing.&lt;/p&gt;

&lt;p&gt;Andreas, I&apos;ll consider changing the recordsize on some of the OSTs. The most likely scenario where we get solid information from this is if the LBUG is still hit on one of the OSSes with the changed setting. I am being a little cautious considering this since it will mean having a ZFS dataset with varying recordsizes. I don&apos;t believe the ZFS layer will care, but it&apos;s not something I&apos;ve dealt with before.&lt;/p&gt;
</comment>
                            <comment id="127658" author="adilger" created="Thu, 17 Sep 2015 17:34:26 +0000"  >&lt;p&gt;Hi Rick, any news on this front?  Have you looked into upgrading to ZFS 0.6.5 to get the native large block support?  The patch &lt;a href=&quot;http://review.whamcloud.com/15127&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/15127&lt;/a&gt; &quot;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4865&quot; title=&quot;osd-zfs: increase object block size dynamically as object grows&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4865&quot;&gt;&lt;del&gt;LU-4865&lt;/del&gt;&lt;/a&gt; zfs: grow block size by write pattern&quot; should also help performance when dealing with files under 1MB in size.&lt;/p&gt;</comment>
                            <comment id="127743" author="rpwagner" created="Fri, 18 Sep 2015 04:07:24 +0000"  >&lt;p&gt;Hi Andreas, since our last update to the code tree based on &lt;a href=&quot;http://review.whamcloud.com/#/c/14926/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/14926/&lt;/a&gt; we&apos;ve been stable. It&apos;s possible that we&apos;ve pulled in a bugfix along with the debugging patch although I couldn&apos;t point to a specific one.&lt;/p&gt;

&lt;p&gt;We are looking at ZFS 0.6.5 to get away from the unreleased version for ZFS we&apos;ve had to run. I would probably do that along with another rebase to a later unpatched tag of Lustre, maybe once &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4865&quot; title=&quot;osd-zfs: increase object block size dynamically as object grows&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4865&quot;&gt;&lt;del&gt;LU-4865&lt;/del&gt;&lt;/a&gt; is included.&lt;/p&gt;

&lt;p&gt;On a related note, I think this issue could be removed from the 2.8 blocker list, since we started with patched versions of Lustre and ZFS.&lt;/p&gt;</comment>
                            <comment id="128943" author="gerrit" created="Wed, 30 Sep 2015 19:06:23 +0000"  >&lt;p&gt;Mike Pershin (mike.pershin@intel.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/16685&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/16685&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6584&quot; title=&quot;OSS hit LBUG and crash&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6584&quot;&gt;&lt;del&gt;LU-6584&lt;/del&gt;&lt;/a&gt; osd: prevent int type overflow in osd_read_prep()&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 687338302147dad5b09b964b8615a3b3adb78a7d&lt;/p&gt;</comment>
                            <comment id="128944" author="tappro" created="Wed, 30 Sep 2015 19:08:48 +0000"  >&lt;p&gt;It seems the reason for this issue is the int type overflow in lnb_rc. Instead of writing the (eof - file_offset) right into lnb_rc we have to check first it is not negative.&lt;/p&gt;</comment>
                            <comment id="129238" author="pjones" created="Sat, 3 Oct 2015 14:19:15 +0000"  >&lt;p&gt;Will SDSC be able to try this patch out to confirm whether it fixes the issues that they have been experiencing?&lt;/p&gt;</comment>
                            <comment id="129240" author="rpwagner" created="Sat, 3 Oct 2015 14:55:23 +0000"  >&lt;p&gt;Yes, we&apos;re scheduling a PM and push this out. Could this patch be related to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7106&quot; title=&quot;Lustre client fail with error vvp_io.c:1081:vvp_io_commit_write()) even went there are space in OST and MDT&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7106&quot;&gt;&lt;del&gt;LU-7106&lt;/del&gt;&lt;/a&gt;? In other words could the current code create an error that propagates back to the client as ENOSPC even when there&apos;s capacity on the OST?&lt;/p&gt;</comment>
                            <comment id="129243" author="tappro" created="Sat, 3 Oct 2015 19:43:05 +0000"  >&lt;p&gt;Rick, this particular issue existed in IO READ code path and is not related to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7106&quot; title=&quot;Lustre client fail with error vvp_io.c:1081:vvp_io_commit_write()) even went there are space in OST and MDT&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7106&quot;&gt;&lt;del&gt;LU-7106&lt;/del&gt;&lt;/a&gt;. I checked OSD code quickly and didn&apos;t notice other similar issues at first glance.&lt;/p&gt;</comment>
                            <comment id="129709" author="gerrit" created="Wed, 7 Oct 2015 17:39:16 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;http://review.whamcloud.com/16685/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/16685/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6584&quot; title=&quot;OSS hit LBUG and crash&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6584&quot;&gt;&lt;del&gt;LU-6584&lt;/del&gt;&lt;/a&gt; osd: prevent int type overflow in osd_read_prep()&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: efe3842c76b8041a048457779554ffa5ba76567d&lt;/p&gt;</comment>
                            <comment id="129722" author="pjones" created="Wed, 7 Oct 2015 18:11:35 +0000"  >&lt;p&gt;Fix landed for 2.8. We&apos;ll reopen if this issue still is hit on Hyperion. If there is still an issue at SDSC and it is not, as hoped, a duplicate of this issue then please open a new ticket to track that issue.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                                        </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                                        </outwardlinks>
                                                                <inwardlinks description="is related to">
                                                        </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="17704" name="23-6.png" size="48101" author="haisong" created="Fri, 8 May 2015 00:05:20 +0000"/>
                            <attachment id="18420" name="log.28119.gz" size="397716" author="haisong" created="Tue, 14 Jul 2015 16:45:51 +0000"/>
                            <attachment id="17710" name="lustre-logs.tgz" size="223" author="haisong" created="Fri, 8 May 2015 16:33:05 +0000"/>
                            <attachment id="17709" name="messages13" size="277499" author="haisong" created="Fri, 8 May 2015 16:28:54 +0000"/>
                            <attachment id="17703" name="panda-oss-23-6_messages" size="1026683" author="haisong" created="Fri, 8 May 2015 00:05:20 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzxctb:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10023"><![CDATA[4]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>