<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:13:42 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-1121] recovery-mds-scale (FLAVOR=OSS): tar: Wrote only 4096 of 7168 bytes</title>
                <link>https://jira.whamcloud.com/browse/LU-1121</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;While running recovery-mds-scale with FLAVOR=OSS, it failed as follows:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;==== Checking the clients loads AFTER  failover -- failure NOT OK
ost3 has failed over 1 times, and counting...
sleeping 582 seconds ... 
tar: etc/selinux/targeted/modules/active/modules/sandbox.pp: Wrote only 4096 of 7168 bytes
tar: Exiting with failure status due to previous errors
Found the END_RUN_FILE file: /home/yujian/test_logs/end_run_file
client-1-ib
Client load failed on node client-1-ib

client client-1-ib load stdout and debug files :
              /tmp/recovery-mds-scale.log_run_tar.sh-client-1-ib
              /tmp/recovery-mds-scale.log_run_tar.sh-client-1-ib.debug
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;/tmp/recovery-mds-scale.log_run_tar.sh-client-1-ib:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;tar: etc/selinux/targeted/modules/active/modules/sandbox.pp: Wrote only 4096 of 7168 bytes
tar: Exiting with failure status due to previous errors
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;/tmp/recovery-mds-scale.log_run_tar.sh-client-1-ib.debug:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;&amp;lt;~snip~&amp;gt;
2012-02-18 22:30:41: tar run starting
+ mkdir -p /mnt/lustre/d0.tar-client-1-ib
+ cd /mnt/lustre/d0.tar-client-1-ib
+ wait 7567
+ do_tar
+ tar cf - /etc
+ tar xf -
+ tee /tmp/recovery-mds-scale.log_run_tar.sh-client-1-ib
tar: Removing leading `/&apos; from member names
+ return 2
+ RC=2
++ grep &apos;exit delayed from previous errors&apos; /tmp/recovery-mds-scale.log_run_tar.sh-client-1-ib
+ PREV_ERRORS=
+ true
+ &apos;[&apos; 2 -ne 0 -a &apos;&apos; -a &apos;&apos; &apos;]&apos;
+ &apos;[&apos; 2 -eq 0 &apos;]&apos;
++ date &apos;+%F %H:%M:%S&apos;
+ echoerr &apos;2012-02-18 22:37:10: tar failed&apos;
+ echo &apos;2012-02-18 22:37:10: tar failed&apos;
2012-02-18 22:37:10: tar failed
&amp;lt;~snip~&amp;gt;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Syslog on client node client-1-ib showed that:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Feb 18 22:34:54 client-1 kernel: INFO: task flush-lustre-1:3510 blocked for more than 120 seconds.
Feb 18 22:34:54 client-1 kernel: &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
Feb 18 22:34:54 client-1 kernel: flush-lustre- D 0000000000000000     0  3510      2 0x00000080
Feb 18 22:34:54 client-1 kernel: ffff8801f70e99a0 0000000000000046 ffff8801f70e9920 ffffffffa0942434
Feb 18 22:34:54 client-1 kernel: 0000000000000000 ffff880331d24980 ffff8801f70e9930 0000000000000000
Feb 18 22:34:54 client-1 kernel: ffff88027d12b0b8 ffff8801f70e9fd8 000000000000f4e8 ffff88027d12b0b8
Feb 18 22:34:54 client-1 kernel: Call Trace:
Feb 18 22:34:54 client-1 kernel: [&amp;lt;ffffffffa0942434&amp;gt;] ? cfs_hash_dual_bd_unlock+0x34/0x60 [libcfs]
Feb 18 22:34:54 client-1 kernel: [&amp;lt;ffffffff8109b809&amp;gt;] ? ktime_get_ts+0xa9/0xe0
Feb 18 22:34:54 client-1 kernel: [&amp;lt;ffffffff81110b10&amp;gt;] ? sync_page+0x0/0x50
Feb 18 22:34:54 client-1 kernel: [&amp;lt;ffffffff814ed1c3&amp;gt;] io_schedule+0x73/0xc0
Feb 18 22:34:54 client-1 kernel: [&amp;lt;ffffffff81110b4d&amp;gt;] sync_page+0x3d/0x50
Feb 18 22:34:54 client-1 kernel: [&amp;lt;ffffffff814eda2a&amp;gt;] __wait_on_bit_lock+0x5a/0xc0
Feb 18 22:34:54 client-1 kernel: [&amp;lt;ffffffff81110ae7&amp;gt;] __lock_page+0x67/0x70
Feb 18 22:34:54 client-1 kernel: [&amp;lt;ffffffff81090c30&amp;gt;] ? wake_bit_function+0x0/0x50
Feb 18 22:34:54 client-1 kernel: [&amp;lt;ffffffff81124c97&amp;gt;] ? __writepage+0x17/0x40
Feb 18 22:34:54 client-1 kernel: [&amp;lt;ffffffff811261f2&amp;gt;] write_cache_pages+0x392/0x4a0
Feb 18 22:34:54 client-1 kernel: [&amp;lt;ffffffff81052600&amp;gt;] ? __dequeue_entity+0x30/0x50
Feb 18 22:34:54 client-1 kernel: [&amp;lt;ffffffff81124c80&amp;gt;] ? __writepage+0x0/0x40
Feb 18 22:34:54 client-1 kernel: [&amp;lt;ffffffff8126a5c9&amp;gt;] ? cpumask_next_and+0x29/0x50
Feb 18 22:34:54 client-1 kernel: [&amp;lt;ffffffff81054754&amp;gt;] ? find_busiest_group+0x244/0xb20
Feb 18 22:34:54 client-1 kernel: [&amp;lt;ffffffff81126324&amp;gt;] generic_writepages+0x24/0x30
Feb 18 22:34:54 client-1 kernel: [&amp;lt;ffffffff81126351&amp;gt;] do_writepages+0x21/0x40
Feb 18 22:34:54 client-1 kernel: [&amp;lt;ffffffff811a046d&amp;gt;] writeback_single_inode+0xdd/0x2c0
Feb 18 22:34:54 client-1 kernel: [&amp;lt;ffffffff811a08ae&amp;gt;] writeback_sb_inodes+0xce/0x180
Feb 18 22:34:54 client-1 kernel: [&amp;lt;ffffffff811a0a0b&amp;gt;] writeback_inodes_wb+0xab/0x1b0
Feb 18 22:34:54 client-1 kernel: [&amp;lt;ffffffff811a0dab&amp;gt;] wb_writeback+0x29b/0x3f0
Feb 18 22:34:54 client-1 kernel: [&amp;lt;ffffffff814eca20&amp;gt;] ? thread_return+0x4e/0x77e
Feb 18 22:34:54 client-1 kernel: [&amp;lt;ffffffff8107cc02&amp;gt;] ? del_timer_sync+0x22/0x30
Feb 18 22:34:54 client-1 kernel: [&amp;lt;ffffffff811a1099&amp;gt;] wb_do_writeback+0x199/0x240
Feb 18 22:34:54 client-1 kernel: [&amp;lt;ffffffff811a11a3&amp;gt;] bdi_writeback_task+0x63/0x1b0
Feb 18 22:34:54 client-1 kernel: [&amp;lt;ffffffff81090ab7&amp;gt;] ? bit_waitqueue+0x17/0xd0
Feb 18 22:34:54 client-1 kernel: [&amp;lt;ffffffff81134d40&amp;gt;] ? bdi_start_fn+0x0/0x100
Feb 18 22:34:54 client-1 kernel: [&amp;lt;ffffffff81134dc6&amp;gt;] bdi_start_fn+0x86/0x100
Feb 18 22:34:54 client-1 kernel: [&amp;lt;ffffffff81134d40&amp;gt;] ? bdi_start_fn+0x0/0x100
Feb 18 22:34:54 client-1 kernel: [&amp;lt;ffffffff81090886&amp;gt;] kthread+0x96/0xa0
Feb 18 22:34:54 client-1 kernel: [&amp;lt;ffffffff8100c14a&amp;gt;] child_rip+0xa/0x20
Feb 18 22:34:54 client-1 kernel: [&amp;lt;ffffffff810907f0&amp;gt;] ? kthread+0x0/0xa0
Feb 18 22:34:54 client-1 kernel: [&amp;lt;ffffffff8100c140&amp;gt;] ? child_rip+0x0/0x20
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Maloo report: &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/f3b4fe94-5af9-11e1-8801-5254004bbbd3&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/f3b4fe94-5af9-11e1-8801-5254004bbbd3&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Please refer to the attached recovery-oss-scale.1329633991.log.tar.bz2 for more logs.&lt;/p&gt;

&lt;p&gt;It seems this is issue &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-874&quot; title=&quot;Client eviction on lock callback timeout &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-874&quot;&gt;&lt;del&gt;LU-874&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;</description>
                <environment>&lt;br/&gt;
Lustre Tag: v2_1_1_0_RC4&lt;br/&gt;
Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b2_1/44/&quot;&gt;http://build.whamcloud.com/job/lustre-b2_1/44/&lt;/a&gt;&lt;br/&gt;
e2fsprogs Build: &lt;a href=&quot;http://build.whamcloud.com/job/e2fsprogs-master/217/&quot;&gt;http://build.whamcloud.com/job/e2fsprogs-master/217/&lt;/a&gt;&lt;br/&gt;
Distro/Arch: RHEL6/x86_64 (kernel version: 2.6.32-220.el6)&lt;br/&gt;
Network: IB (in-kernel OFED)&lt;br/&gt;
ENABLE_QUOTA=yes&lt;br/&gt;
FAILURE_MODE=HARD&lt;br/&gt;
FLAVOR=OSS&lt;br/&gt;
&lt;br/&gt;
MGS/MDS Nodes: client-8-ib&lt;br/&gt;
&lt;br/&gt;
OSS Nodes: client-18-ib(active), client-19-ib(active)&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;\ /&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;OST1 (active in client-18-ib)&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;OST2 (active in client-19-ib)&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;OST3 (active in client-18-ib)&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;OST4 (active in client-19-ib)&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;OST5 (active in client-18-ib)&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;OST6 (active in client-19-ib)&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;client-9-ib(OST7)&lt;br/&gt;
&lt;br/&gt;
Client Nodes: client-[1,4,17],fat-amd-2,fat-intel-2&lt;br/&gt;
&lt;br/&gt;
Network Addresses:&lt;br/&gt;
client-1-ib: 192.168.4.1&lt;br/&gt;
client-4-ib: 192.168.4.4&lt;br/&gt;
client-8-ib: 192.168.4.8&lt;br/&gt;
client-9-ib: 192.168.4.9&lt;br/&gt;
client-17-ib: 192.168.4.17&lt;br/&gt;
client-18-ib: 192.168.4.18&lt;br/&gt;
client-19-ib: 192.168.4.19&lt;br/&gt;
fat-amd-2-ib: 192.168.4.133&lt;br/&gt;
fat-intel-2-ib: 192.168.4.129&lt;br/&gt;
</environment>
        <key id="13238">LU-1121</key>
            <summary>recovery-mds-scale (FLAVOR=OSS): tar: Wrote only 4096 of 7168 bytes</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="6" iconUrl="https://jira.whamcloud.com/images/icons/statuses/closed.png" description="The issue is considered finished, the resolution is correct. Issues which are closed can be reopened.">Closed</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="2">Won&apos;t Fix</resolution>
                                        <assignee username="green">Oleg Drokin</assignee>
                                    <reporter username="yujian">Jian Yu</reporter>
                        <labels>
                    </labels>
                <created>Sun, 19 Feb 2012 08:39:03 +0000</created>
                <updated>Sun, 14 Aug 2016 17:18:01 +0000</updated>
                            <resolved>Sun, 14 Aug 2016 17:18:01 +0000</resolved>
                                    <version>Lustre 2.1.1</version>
                    <version>Lustre 2.1.3</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>5</watches>
                                                                            <comments>
                            <comment id="29447" author="yujian" created="Sun, 19 Feb 2012 08:41:04 +0000"  >&lt;p&gt;Hi Jay,&lt;br/&gt;
Could you please take a look to see whether this is a duplicate of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-874&quot; title=&quot;Client eviction on lock callback timeout &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-874&quot;&gt;&lt;del&gt;LU-874&lt;/del&gt;&lt;/a&gt; or not? Thanks.&lt;/p&gt;</comment>
                            <comment id="29450" author="green" created="Sun, 19 Feb 2012 10:22:13 +0000"  >&lt;p&gt;Looking at the logs, I am not all that sure it&apos;s related to lu-874 after all.&lt;/p&gt;

&lt;p&gt;I don&apos;t see the affected filename ever being written to, so possibly the logs are too short to see the problem, or we just don&apos;t have the necessary debug level enabled to see where the write failed to look around that time and the tar output itself is not timed.&lt;/p&gt;

&lt;p&gt;Additionally the last part of client-1 log is pretty strange. It appears that after 22:36, where there is last cd into d0-tar... dir (after the failure was already registered), a period of very frequent pings to OSTs begins for some reason, no other activity happens.&lt;br/&gt;
It continues for ~561 seconds, I am not really sure what is that part about.&lt;/p&gt;</comment>
                            <comment id="29468" author="jay" created="Mon, 20 Feb 2012 01:57:03 +0000"  >&lt;p&gt;In my humble opinion, the stack trace is not related with the issue here, neither is it related to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-874&quot; title=&quot;Client eviction on lock callback timeout &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-874&quot;&gt;&lt;del&gt;LU-874&lt;/del&gt;&lt;/a&gt;. The real issue is due to short write where 7168 bytes was supposed to be written but actually 4096 bytes was written. I didn&apos;t find any clue from logs. I suspect that there must exist issues with kms handling.&lt;/p&gt;

&lt;p&gt;The reason why there was no activity on client-1 after 22:36 is because the test loop exited due to failure on tar.&lt;/p&gt;</comment>
                            <comment id="29477" author="yujian" created="Mon, 20 Feb 2012 09:56:50 +0000"  >&lt;p&gt;Thanks Oleg and Jinshan for the analysis. I&apos;m going to reproduce this issue and gather more logs. The current debug level is:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@client-18-ib ~]# lctl get_param debug
debug=
super warning dlmtrace error emerg ha rpctrace vfstrace config console
[root@client-18-ib ~]# lctl get_param subsystem_debug
subsystem_debug=
undefined mdc mds osc ost class log llite rpc filter echo ldlm lov lquota lmv sec gss mgc mgs fid fld
[root@client-18-ib ~]# lctl get_param debug_mb
debug_mb=48
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Could you please suggest what extra debug code I should add to gather more logs? Thanks.&lt;/p&gt;</comment>
                            <comment id="29480" author="green" created="Mon, 20 Feb 2012 10:45:30 +0000"  >&lt;p&gt;Jinshan, please note the abnormally high rate of pings on all osts after 22:36, it&apos;s unrelated to the bug at hand, but still looks strange.&lt;/p&gt;

&lt;p&gt;YuJian, I imagine having vfstrace enabled would have been helpful, also somewhat increased debug buffer size.&lt;br/&gt;
The kms tracking is done under D_INODE flag, though we usually don&apos;t run with it and I don&apos;t know how much it will impact testing.&lt;br/&gt;
If you are looking to do extended testing, +vfstrace +inode +ha look like sensible options to me. Also a 1024 in debug_mb if you have the RAM. &lt;/p&gt;</comment>
                            <comment id="29487" author="jay" created="Mon, 20 Feb 2012 14:12:39 +0000"  >&lt;blockquote&gt;
&lt;p&gt;Could you please suggest what extra debug code I should add to gather more logs? Thanks.&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;On the client side, I always do the following:&lt;/p&gt;

&lt;p&gt;lctl set_param debug=-1&lt;br/&gt;
lctl set_param debug=-trace&lt;/p&gt;

&lt;p&gt;In this way, I can get reasonable log and not need a super size debug memory. Just a hint &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/wink.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;</comment>
                            <comment id="29532" author="yujian" created="Wed, 22 Feb 2012 02:55:13 +0000"  >&lt;p&gt;After I set the debug level and size as follows on all of the nodes:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;debug=-1
subsystem_debug=0xffb7e3ff
debug_mb=200
debug=-trace
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Running recovery-mds-scale with FLAVOR=OSS hit the following LBUG on OSS client-18-ib:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;LustreError: 5227:0:(filter.c:4141:filter_destroy())  lustre-OST0003: can not find olg of group 0^M
LustreError: 5227:0:(filter.c:3683:filter_handle_precreate()) ASSERTION(diff &amp;gt;= 0) failed: lustre-OST0003: 163 - 225 = -62^M
LustreError: 5227:0:(filter.c:3683:filter_handle_precreate()) LBUG^M
Pid: 5227, comm: tgt_recov^M
^M
Call Trace:^M
 [&amp;lt;ffffffffa0636855&amp;gt;] libcfs_debug_dumpstack+0x55/0x80 [libcfs]^M
 [&amp;lt;ffffffffa0636e95&amp;gt;] lbug_with_loc+0x75/0xe0 [libcfs]^M
 [&amp;lt;ffffffffa090ec4d&amp;gt;] filter_create+0x160d/0x1640 [obdfilter]^M
 [&amp;lt;ffffffffa0919b11&amp;gt;] filter_preprw_write+0x6c1/0x1f10 [obdfilter]^M
 [&amp;lt;ffffffffa083e87b&amp;gt;] ? _debug_req+0x51b/0x660 [ptlrpc]^M
 [&amp;lt;ffffffffa091c258&amp;gt;] filter_preprw+0x68/0x90 [obdfilter]^M
 [&amp;lt;ffffffffa07c110e&amp;gt;] obd_preprw+0x11e/0x420 [ost]^M
 [&amp;lt;ffffffffa07cb18c&amp;gt;] ost_brw_write+0x98c/0x1870 [ost]^M
 [&amp;lt;ffffffffa06416a1&amp;gt;] ? libcfs_debug_vmsg2+0x4d1/0xb50 [libcfs]^M
 [&amp;lt;ffffffffa083b264&amp;gt;] ? lustre_msg_get_opc+0x94/0x100 [ptlrpc]^M
 [&amp;lt;ffffffffa07cfce5&amp;gt;] ost_handle+0x3325/0x4b90 [ost]^M
 [&amp;lt;ffffffffa0b16b16&amp;gt;] ? vvp_session_key_init+0xd6/0x1d0 [lustre]^M
 [&amp;lt;ffffffffa07cc9c0&amp;gt;] ? ost_handle+0x0/0x4b90 [ost]^M
 [&amp;lt;ffffffffa0800cf6&amp;gt;] handle_recovery_req+0x1f6/0x330 [ptlrpc]^M
 [&amp;lt;ffffffffa0801257&amp;gt;] target_recovery_thread+0x3a7/0xf50 [ptlrpc]^M
 [&amp;lt;ffffffffa0800eb0&amp;gt;] ? target_recovery_thread+0x0/0xf50 [ptlrpc]^M
 [&amp;lt;ffffffff8100c14a&amp;gt;] child_rip+0xa/0x20^M
 [&amp;lt;ffffffffa0800eb0&amp;gt;] ? target_recovery_thread+0x0/0xf50 [ptlrpc]^M
 [&amp;lt;ffffffffa0800eb0&amp;gt;] ? target_recovery_thread+0x0/0xf50 [ptlrpc]^M
 [&amp;lt;ffffffff8100c140&amp;gt;] ? child_rip+0x0/0x20^M
^M
Kernel panic - not syncing: LBUG^M
Pid: 5227, comm: tgt_recov Not tainted 2.6.32-220.el6_lustre.g4554b65.x86_64 #1^M
Call Trace:^M
 [&amp;lt;ffffffff814ec661&amp;gt;] ? panic+0x78/0x143^M
 [&amp;lt;ffffffffa0636eeb&amp;gt;] ? lbug_with_loc+0xcb/0xe0 [libcfs]^M
 [&amp;lt;ffffffffa090ec4d&amp;gt;] ? filter_create+0x160d/0x1640 [obdfilter]^M
 [&amp;lt;ffffffffa0919b11&amp;gt;] ? filter_preprw_write+0x6c1/0x1f10 [obdfilter]^M
 [&amp;lt;ffffffffa083e87b&amp;gt;] ? _debug_req+0x51b/0x660 [ptlrpc]^M
 [&amp;lt;ffffffffa091c258&amp;gt;] ? filter_preprw+0x68/0x90 [obdfilter]^M
 [&amp;lt;ffffffffa07c110e&amp;gt;] ? obd_preprw+0x11e/0x420 [ost]^M
 [&amp;lt;ffffffffa07cb18c&amp;gt;] ? ost_brw_write+0x98c/0x1870 [ost]^M
 [&amp;lt;ffffffffa06416a1&amp;gt;] ? libcfs_debug_vmsg2+0x4d1/0xb50 [libcfs]^M
 [&amp;lt;ffffffffa083b264&amp;gt;] ? lustre_msg_get_opc+0x94/0x100 [ptlrpc]^M
 [&amp;lt;ffffffffa07cfce5&amp;gt;] ? ost_handle+0x3325/0x4b90 [ost]^M
 [&amp;lt;ffffffffa0b16b16&amp;gt;] ? vvp_session_key_init+0xd6/0x1d0 [lustre]^M
 [&amp;lt;ffffffffa07cc9c0&amp;gt;] ? ost_handle+0x0/0x4b90 [ost]^M
 [&amp;lt;ffffffffa0800cf6&amp;gt;] ? handle_recovery_req+0x1f6/0x330 [ptlrpc]^M
 [&amp;lt;ffffffffa0801257&amp;gt;] ? target_recovery_thread+0x3a7/0xf50 [ptlrpc]^M
 [&amp;lt;ffffffffa0800eb0&amp;gt;] ? target_recovery_thread+0x0/0xf50 [ptlrpc]^M
 [&amp;lt;ffffffff8100c14a&amp;gt;] ? child_rip+0xa/0x20^M
 [&amp;lt;ffffffffa0800eb0&amp;gt;] ? target_recovery_thread+0x0/0xf50 [ptlrpc]^M
 [&amp;lt;ffffffffa0800eb0&amp;gt;] ? target_recovery_thread+0x0/0xf50 [ptlrpc]^M
 [&amp;lt;ffffffff8100c140&amp;gt;] ? child_rip+0x0/0x20^M
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Please refer to /scratch/logs/2.1.1/recovery-oss-scale.1329896304.log.tar.bz2 on brent node for more logs.&lt;/p&gt;</comment>
                            <comment id="29533" author="yujian" created="Wed, 22 Feb 2012 03:31:56 +0000"  >&lt;p&gt;I&apos;m disabling panic_on_lbug to get debug log...&lt;/p&gt;</comment>
                            <comment id="29541" author="yujian" created="Wed, 22 Feb 2012 07:51:33 +0000"  >&lt;blockquote&gt;&lt;p&gt;I&apos;m disabling panic_on_lbug to get debug log...&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;After disabling panic_on_lbug, I could not reproduce the above LBUG and the original issue of this ticket, but kept hitting the known issue: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-463&quot; title=&quot;orphan recovery happens too late, causing writes to fail with ENOENT after recovery&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-463&quot;&gt;&lt;del&gt;LU-463&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="29614" author="green" created="Thu, 23 Feb 2012 00:02:58 +0000"  >&lt;p&gt;The second crash is now tracked under &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1129&quot; title=&quot;filter_handle_precreate()) ASSERTION(diff &amp;gt;= 0) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1129&quot;&gt;&lt;del&gt;LU-1129&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="43174" author="yujian" created="Tue, 14 Aug 2012 07:18:01 +0000"  >&lt;p&gt;Lustre Tag: v2_1_3_RC1&lt;br/&gt;
Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b2_1/113/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b2_1/113/&lt;/a&gt;&lt;br/&gt;
Distro/Arch: RHEL6.3/x86_64 (kernel version: 2.6.32-279.2.1.el6)&lt;br/&gt;
Network: IB (in-kernel OFED)&lt;br/&gt;
ENABLE_QUOTA=yes&lt;br/&gt;
FAILURE_MODE=HARD&lt;/p&gt;

&lt;p&gt;The original issue occurred again while running recovery-mds-scale failover_ost test:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;tar: etc/selinux/targeted/modules/active/modules/rhgb.pp: Wrote only 4096 of 7680 bytes
tar: Exiting with failure status due to previous errors
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/dc54205c-e534-11e1-ae4e-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/dc54205c-e534-11e1-ae4e-52540035b04c&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;After setting PTLDEBUG=-1 and DEBUG_SIZE=200 to reproduce the issue and gather more logs, I hit &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-463&quot; title=&quot;orphan recovery happens too late, causing writes to fail with ENOENT after recovery&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-463&quot;&gt;&lt;del&gt;LU-463&lt;/del&gt;&lt;/a&gt; again:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/b18a1330-e5ad-11e1-ae4e-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/b18a1330-e5ad-11e1-ae4e-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="161836" author="simmonsja" created="Sun, 14 Aug 2016 17:18:01 +0000"  >&lt;p&gt;Really old blocker for unsupported version&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="13269">LU-1129</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="10861" name="recovery-oss-scale.1329633991.log.tar.bz2" size="1108666" author="yujian" created="Sun, 19 Feb 2012 08:39:03 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzv31r:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>3993</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>