<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:43:08 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-4483]  sanity test 133f: Panic</title>
                <link>https://jira.whamcloud.com/browse/LU-4483</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Lustre: DEBUG MARKER: == sanity test 133f: Check for LBUGs/Oopses/unreadable files in /proc == 13:22:32 (1389691352)&lt;br/&gt;
proc_file_read: Apparent buffer overflow!&lt;br/&gt;
proc_file_read: Apparent buffer overflow!&lt;br/&gt;
proc_file_read: Apparent buffer overflow!&lt;br/&gt;
proc_file_read: Apparent buffer overflow!&lt;br/&gt;
proc_file_read: Apparent buffer overflow!&lt;br/&gt;
proc_file_read: Apparent buffer overflow!&lt;br/&gt;
proc_file_read: Apparent buffer overflow!&lt;br/&gt;
proc_file_read: Apparent buffer overflow!&lt;br/&gt;
proc_file_read: Apparent buffer overflow!&lt;br/&gt;
general protection fault: 0000 &amp;#91;#1&amp;#93; SMP &lt;br/&gt;
last sysfs file: /sys/devices/system/cpu/online&lt;br/&gt;
CPU 0 &lt;br/&gt;
Modules linked in: ext2 lustre ofd osp lod ost mdt mdd mgs nodemap osd_ldiskfs ldiskfs exportfs lquota lfsck jbd obdecho mgc lov osc mdc lmv fid fld ptlrpc obdclass ksocklnd lnet sha512_generic sha256_generic crc32c_intel libcfs nfs lockd auth_rpcgss nfs_acl sunrpc cachefiles fscache(T) ib_ipoib ib_cm ipv6 ib_uverbs ib_umad mlx4_ib ib_sa ib_mad ib_core mlx4_en mlx4_core dm_mirror dm_region_hash dm_log dm_mod ppdev parport_pc parport vmware_balloon microcode vmxnet3 sg i2c_piix4 i2c_core shpchp ext4 jbd2 mbcache sd_mod crc_t10dif sr_mod cdrom vmw_pvscsi pata_acpi ata_generic ata_piix &lt;span class=&quot;error&quot;&gt;&amp;#91;last unloaded: scsi_wait_scan&amp;#93;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;Pid: 35, comm: events/0 Tainted: G           ---------------  T 2.6.32-358.18.1-lustre #0 VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform&lt;br/&gt;
RIP: 0010:&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff812a3650&amp;gt;&amp;#93;&lt;/span&gt;  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff812a3650&amp;gt;&amp;#93;&lt;/span&gt; list_del+0x10/0xa0&lt;br/&gt;
RSP: 0018:ffff88013ddbdc80  EFLAGS: 00010082&lt;br/&gt;
RAX: 302f302f302f302f RBX: ffff880011c86040 RCX: ffff880138903c50&lt;br/&gt;
RDX: 0020000000000080 RSI: ffff880091faf000 RDI: ffff880011c86040&lt;br/&gt;
RBP: ffff88013ddbdc90 R08: ffff880138903c50 R09: 0000000000000001&lt;br/&gt;
R10: 0000000000000000 R11: ffff880090218820 R12: ffff880011c86040&lt;br/&gt;
R13: ffff8801399817c0 R14: 0000000000000002 R15: 000000000000101a&lt;br/&gt;
FS:  0000000000000000(0000) GS:ffff88002be00000(0000) knlGS:0000000000000000&lt;br/&gt;
CS:  0010 DS: 0018 ES: 0018 CR0: 000000008005003b&lt;br/&gt;
CR2: 00000000022715a8 CR3: 000000008514e000 CR4: 00000000001407f0&lt;br/&gt;
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000&lt;br/&gt;
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400&lt;br/&gt;
Process events/0 (pid: 35, threadinfo ffff88013ddbc000, task ffff88013ddba1c0)&lt;br/&gt;
Stack:&lt;br/&gt;
 ffff8800487fb040 ffff8801385726c0 ffff88013ddbdcf0 ffffffff8117aab6&lt;br/&gt;
&amp;lt;d&amp;gt; ffff880138903ca8 0000000300000000 ffff880011c86850 ffff880138903c50&lt;br/&gt;
&amp;lt;d&amp;gt; ffff88013ddbdcf0 ffff880139981760 ffff8801385726c0 0000000000000003&lt;br/&gt;
Call Trace:&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8117aab6&amp;gt;&amp;#93;&lt;/span&gt; free_block+0xc6/0x230&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8117ae51&amp;gt;&amp;#93;&lt;/span&gt; drain_array+0xc1/0xf0&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8117eb3f&amp;gt;&amp;#93;&lt;/span&gt; ? cache_reap+0x2f/0x2e0&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8117ebbe&amp;gt;&amp;#93;&lt;/span&gt; cache_reap+0xae/0x2e0&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff810931bb&amp;gt;&amp;#93;&lt;/span&gt; ? worker_thread+0x1cb/0x3d0&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8117eb10&amp;gt;&amp;#93;&lt;/span&gt; ? cache_reap+0x0/0x2e0&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8109320c&amp;gt;&amp;#93;&lt;/span&gt; worker_thread+0x21c/0x3d0&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff810931bb&amp;gt;&amp;#93;&lt;/span&gt; ? worker_thread+0x1cb/0x3d0&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81099a00&amp;gt;&amp;#93;&lt;/span&gt; ? autoremove_wake_function+0x0/0x40&lt;/p&gt;</description>
                <environment>OSTCOUNT=40 sh sanity.sh</environment>
        <key id="22732">LU-4483</key>
            <summary> sanity test 133f: Panic</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="wc-triage">WC Triage</assignee>
                                    <reporter username="shadow">Alexey Lyashkov</reporter>
                        <labels>
                    </labels>
                <created>Tue, 14 Jan 2014 09:26:52 +0000</created>
                <updated>Wed, 23 Dec 2015 19:02:15 +0000</updated>
                            <resolved>Tue, 23 Dec 2014 09:42:15 +0000</resolved>
                                    <version>Lustre 2.6.0</version>
                                    <fixVersion>Lustre 2.6.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>5</watches>
                <comments>
                            <comment id="74919" author="simmonsja" created="Tue, 14 Jan 2014 12:24:54 +0000"  >&lt;p&gt;Somebody is attempting to read data using too little memory using a non seq_file read routine. If this started to just happen it might be NRS tbf or nodemap. Since it crashed on a list_del I only see NRS tbf working with list. Can you post a link to the maloo test that failed so I can see which proc entry is the source of the problem.&lt;/p&gt;</comment>
                            <comment id="74948" author="shadow" created="Tue, 14 Jan 2014 17:38:40 +0000"  >&lt;p&gt;James,&lt;/p&gt;

&lt;p&gt;it&apos;s my local sanity.sh run; console output:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;cat: /proc/fs/lustre/osc/lustre-OST0005-osc-ffff8800aff251b8/ping: Input/output error
cat: /proc/fs/lustre/osc/lustre-OST0004-osc-ffff8800aff251b8/ping: Input/output error
cat: /proc/fs/lustre/osc/lustre-OST0003-osc-ffff8800aff251b8/ping: Input/output error
cat: /proc/fs/lustre/osc/lustre-OST0002-osc-ffff8800aff251b8/ping: Input/output error
cat: /proc/fs/lustre/osc/lustre-OST0001-osc-ffff8800aff251b8/ping: Input/output error
cat: /proc/fs/lustre/osc/lustre-OST0000-osc-ffff8800aff251b8/ping: Input/output error
cat: /proc/fs/lustre/mdc/lustre-MDT0000-mdc-ffff8800aff251b8/ping: Input/output error
cat: /proc/fs/lustre/fld/cli-lustre-clilmv-ffff8800aff251b8/cache_flush: Input/output error
Timeout, server 192.168.69.5 not responding.
berloga-mac:~ shadow$
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;From my point of view it&apos;s a memory corruption issue: someone overwrote a slab&apos;s internal list.&lt;/p&gt;</comment>
                            <comment id="75017" author="adilger" created="Wed, 15 Jan 2014 18:54:38 +0000"  >&lt;p&gt;Shadow, it would be great if you could include &quot;git describe&quot; output with your bug reports, so we can see what version of the code you are running.&lt;/p&gt;

&lt;p&gt;James, will the /proc file in question be moved over to seq_file with your patches?  Do you know which patch covers this /proc file, and has it landed yet?&lt;/p&gt;</comment>
                            <comment id="75031" author="simmonsja" created="Wed, 15 Jan 2014 20:19:36 +0000"  >&lt;p&gt;The output Alexey is seeing for the the sanity test is normal. Both ping and cache_flush are write only so reading will give a error. In the past you could perform a read on a write proc entry. Now the LBUG is not normal. Can you send me your dmesg and lustre debug logs. I like to see which proc entry is exactly crashing. I don&apos;t know if the above proc entries are the real problem. &lt;/p&gt;</comment>
                            <comment id="75118" author="simmonsja" created="Thu, 16 Jan 2014 19:04:19 +0000"  >&lt;p&gt;Today I looked to see why a test for readable proc file was selecting write only enrty. It appears to be find -readable option still selects write only files. When I used -perm 444 I had no problems. I can&apos;t reproduce the Oops you saw so far. This is testing with a unpatched copy of the lastest master from this morning.&lt;/p&gt;</comment>
                            <comment id="75121" author="shadow" created="Thu, 16 Jan 2014 19:10:12 +0000"  >&lt;p&gt;James,&lt;/p&gt;

&lt;p&gt;it&apos;s not an LBUG (i.e. a Lustre bug), it just found a list corruption. Did you run your own tests with a kernel with debug options enabled? I have all debug options enabled.&lt;/p&gt;</comment>
                            <comment id="75125" author="shadow" created="Thu, 16 Jan 2014 19:30:57 +0000"  >&lt;p&gt;replicated again&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@rhel6-64 tests]# OSTCOUNT=40 ONLY=133 REFORMAT=yes sh sanity.sh 
Logging to shared log directory: /tmp/test_logs/1389900285
== sanity test 133e: Verifying OST {read,write}_bytes nid stats =================== 23:26:08 (1389900368)
42+0 records in
42+0 records out
1376256 bytes (1.4 MB) copied, 0.0373407 s, 36.9 MB/s
 sanity test_133e: @@@@@@ FAIL: Bad write_bytes sum, expected 1376256, got 0 
  Trace dump:
  = /Users/shadow/work/lustre/work/WC-review/CLSTR-2003/lustre/tests/test-framework.sh:4430:error_noexit()
  = /Users/shadow/work/lustre/work/WC-review/CLSTR-2003/lustre/tests/test-framework.sh:4461:error()
  = sanity.sh:8714:test_133e()
  = /Users/shadow/work/lustre/work/WC-review/CLSTR-2003/lustre/tests/test-framework.sh:4701:run_one()
  = /Users/shadow/work/lustre/work/WC-review/CLSTR-2003/lustre/tests/test-framework.sh:4736:run_one_logged()
  = /Users/shadow/work/lustre/work/WC-review/CLSTR-2003/lustre/tests/test-framework.sh:4562:run_test()
  = sanity.sh:8720:main()
Dumping lctl log to /tmp/test_logs/1389900285/sanity.test_133e.*.1389900369.log
Dumping logs only on local client.
FAIL 133e (25s)

== sanity test 133f: Check for LBUGs/Oopses/unreadable files in /proc == 23:26:33 (1389900393)



Timeout, server 192.168.69.5 not responding.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;console output&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;LustreError: Skipped 3513 previous similar messages
LustreError: 685:0:(ldlm_resource.c:1154:ldlm_resource_get()) lustre-OST001e: lvbo_init failed for resource 0x43:0x0: rc = -2
LustreError: 685:0:(ldlm_resource.c:1154:ldlm_resource_get()) Skipped 6505 previous similar messages
LustreError: 11-0: lustre-OST0018-osc-ffff88008357c488: Communicating with 0@lo, operation ldlm_enqueue failed with -12.
LustreError: Skipped 6484 previous similar messages
Lustre: DEBUG MARKER: == sanity test 133e: Verifying OST read_bytes write_bytes nid stats =================== 23:26:08 (1389900368)
Lustre: DEBUG MARKER: cancel_lru_locks osc start
Lustre: DEBUG MARKER: cancel_lru_locks osc stop
Lustre: DEBUG MARKER: cancel_lru_locks osc start
Lustre: DEBUG MARKER: cancel_lru_locks osc stop
Lustre: DEBUG MARKER: sanity test_133e: @@@@@@ FAIL: Bad write_bytes sum, expected 1376256, got 0
Lustre: DEBUG MARKER: == sanity test 133f: Check for LBUGs/Oopses/unreadable files in /proc == 23:26:33 (1389900393)
proc_file_read: Apparent buffer overflow!
proc_file_read: Apparent buffer overflow!
proc_file_read: Apparent buffer overflow!
general protection fault: 0000 [#1] SMP 
last sysfs file: /sys/devices/system/cpu/online
CPU 3 
Modules linked in: lustre ofd osp lod ost mdt mdd mgs nodemap osd_ldiskfs ldiskfs lquota lfsck obdecho mgc lov osc mdc lmv fid fld ptlrpc obdclass ksocklnd lnet libcfs exportfs jb
d sha512_generic sha256_generic crc32c_intel nfs lockd auth_rpcgss nfs_acl sunrpc cachefiles fscache(T) ib_ipoib ib_cm ipv6 ib_uverbs ib_umad mlx4_ib ib_sa ib_mad ib_core mlx4_en 
mlx4_core dm_mirror dm_region_hash dm_log dm_mod ppdev parport_pc parport microcode vmware_balloon vmxnet3 sg i2c_piix4 i2c_core shpchp ext4 jbd2 mbcache sd_mod crc_t10dif sr_mod 
cdrom vmw_pvscsi pata_acpi ata_generic ata_piix [last unloaded: libcfs]

Pid: 1727, comm: vmmemctl Tainted: G           ---------------  T 2.6.32-358.18.1-lustre #0 VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform
RIP: 0010:[&amp;lt;ffffffff811d1e0c&amp;gt;]  [&amp;lt;ffffffff811d1e0c&amp;gt;] nr_blockdev_pages+0x3c/0x80
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="75130" author="simmonsja" created="Thu, 16 Jan 2014 19:47:15 +0000"  >&lt;p&gt;Now I see the problem with test 133e with the nid stats for the obdfilter layer. I was only looking at 133f test. I have some but not all the debug flags enabled. Which one do you have so I can reproduce this?&lt;/p&gt;</comment>
                            <comment id="75133" author="shadow" created="Thu, 16 Jan 2014 20:27:48 +0000"  >&lt;p&gt;I have a few kernel modification to print sentry name have a overflow.&lt;br/&gt;
Lustre: DEBUG MARKER: == sanity test 133f: Check for LBUGs/Oopses/unreadable files in /proc == 00:17:07 (1389903427)&lt;br/&gt;
proc_file_read: &apos;hash&apos;/ffffffffa055fc60 Apparent buffer overflow!&lt;br/&gt;
proc_file_read: &apos;hash&apos;/ffffffffa055fc60 Apparent buffer overflow!&lt;br/&gt;
proc_file_read: &apos;hash&apos;/ffffffffa055fc60 Apparent buffer overflow!&lt;br/&gt;
------------&amp;#91; cut here &amp;#93;------------&lt;br/&gt;
crash&amp;gt; l *(0xffffffffa055fc60)&lt;br/&gt;
0xffffffffa055fc60 is in lprocfs_exp_rd_hash (/Users/shadow/work/lustre/work/WC-review/CLSTR-2003/lustre/obdclass/lprocfs_status.c:2568).&lt;br/&gt;
2563            return 0;&lt;br/&gt;
2564    }&lt;br/&gt;
2565    &lt;br/&gt;
2566    int lprocfs_exp_rd_hash(char *page, char **start, off_t off, int count,&lt;br/&gt;
2567                            int *eof,  void *data)&lt;/p&gt;

&lt;p&gt;I hope it helps you.&lt;/p&gt;</comment>
                            <comment id="75154" author="shadow" created="Fri, 17 Jan 2014 04:55:47 +0000"  >&lt;p&gt;as i see lprocfs_exp_rd_hash put a data to a fixed buffer size (page size) without a size control, it&apos;s produce a overwriting a random area in memory so random memory corruption.&lt;/p&gt;</comment>
                            <comment id="75344" author="simmonsja" created="Tue, 21 Jan 2014 13:31:59 +0000"  >&lt;p&gt;Absolute correct. The problem is that __proc_file_read allocates only a single page to use. We have to test if we overflow the page in cfs_hash_debug_str. Once we overflow we have to return which also requires use remembering were we left off. The fix is quite ugly but we can assume the source of the overflow is going to be the looping through the hash bucket. We have to pass in a offset to this function and then if we have a offset jump to the hash bucket loop and start going through the loop at the right place. The other option is to wait for the seq_file patches for server side to land. This would resolve this since seq_file buffers are handled dynamically. That is they are resized on the fly when a overflow happens.&lt;/p&gt;</comment>
                            <comment id="84294" author="simmonsja" created="Fri, 16 May 2014 19:50:54 +0000"  >&lt;p&gt;This should be fixed by the landing of &lt;a href=&quot;http://review.whamcloud.com/#/c/8049&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/8049&lt;/a&gt; from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3319&quot; title=&quot;Adapt to 3.10 upstream kernel proc_dir_entry change&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3319&quot;&gt;&lt;del&gt;LU-3319&lt;/del&gt;&lt;/a&gt;. Peter can you link this tick to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3319&quot; title=&quot;Adapt to 3.10 upstream kernel proc_dir_entry change&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3319&quot;&gt;&lt;del&gt;LU-3319&lt;/del&gt;&lt;/a&gt;. We will need a fix for b2_5/b2_4 as well.&lt;/p&gt;</comment>
                            <comment id="102239" author="adilger" created="Tue, 23 Dec 2014 09:42:15 +0000"  >&lt;p&gt;Fixed as part of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3319&quot; title=&quot;Adapt to 3.10 upstream kernel proc_dir_entry change&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3319&quot;&gt;&lt;del&gt;LU-3319&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="18888">LU-3319</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="27073">LU-5764</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                    <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                        </customfieldvalues>
                    </customfield>
                    <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzwcs7:</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                    <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>12274</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                    <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                </customfields>
    </item>
</channel>
</rss>