<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:27:29 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-2703] racer: BUG: soft lockup - CPU#0 stuck for 67s! [dd:1404]</title>
                <link>https://jira.whamcloud.com/browse/LU-2703</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;While running racer test, the following issue occurred on one of the two clients:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00:11:31:Lustre: DEBUG MARKER: == racer test 1: racer on clients: client-28vm1,client-28vm2.lab.whamcloud.com DURATION=900 == 00:11:30 (1359447090)
00:11:32:Lustre: DEBUG MARKER: DURATION=900 /usr/lib64/lustre/tests/racer/racer.sh /mnt/lustre/racer 
00:24:12:BUG: soft lockup - CPU#0 stuck for 67s! [dd:1404]
00:24:12:Modules linked in: mgc(U) lustre(U) lov(U) mdc(U) lquota(U) osc(U) ksocklnd(U) ptlrpc(U) obdclass(U) lnet(U) lvfs(U) libcfs(U) nfs fscache nfsd lockd nfs_acl auth_rpcgss exportfs autofs4 sunrpc ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm ib_addr ipv6 ib_sa ib_mad ib_core microcode virtio_balloon 8139too 8139cp mii i2c_piix4 i2c_core ext3 jbd mbcache virtio_blk virtio_pci virtio_ring virtio pata_acpi ata_generic ata_piix dm_mirror dm_region_hash dm_log dm_mod [last unloaded: speedstep_lib]
00:24:12:CPU 0 
00:24:12:Modules linked in: mgc(U) lustre(U) lov(U) mdc(U) lquota(U) osc(U) ksocklnd(U) ptlrpc(U) obdclass(U) lnet(U) lvfs(U) libcfs(U) nfs fscache nfsd lockd nfs_acl auth_rpcgss exportfs autofs4 sunrpc ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm ib_addr ipv6 ib_sa ib_mad ib_core microcode virtio_balloon 8139too 8139cp mii i2c_piix4 i2c_core ext3 jbd mbcache virtio_blk virtio_pci virtio_ring virtio pata_acpi ata_generic ata_piix dm_mirror dm_region_hash dm_log dm_mod [last unloaded: speedstep_lib]
00:24:12:
00:24:12:Pid: 1404, comm: dd Not tainted 2.6.32-279.19.1.el6.x86_64 #1 Red Hat KVM
00:24:12:RIP: 0010:[&amp;lt;ffffffff814ec53e&amp;gt;]  [&amp;lt;ffffffff814ec53e&amp;gt;] _spin_lock+0x1e/0x30
00:24:12:RSP: 0018:ffff880030ce16a8  EFLAGS: 00000206
00:24:13:RAX: 0000000000000001 RBX: ffff880030ce16a8 RCX: ffff8800783c6ba0
00:24:13:RDX: 0000000000000000 RSI: ffff88004ecec7c0 RDI: ffff88007b510a1c
00:24:13:RBP: ffffffff8100bb8e R08: 0000000000000102 R09: 0000000000000000
00:24:14:R10: 0000000003b5e000 R11: 000000000000000e R12: ffffffffa058315f
00:24:14:R13: ffff880030ce1638 R14: 0000000003b5e000 R15: 0000000003b5efff
00:24:14:FS:  00007f288786d700(0000) GS:ffff880002200000(0000) knlGS:0000000000000000
00:24:14:CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
00:24:14:CR2: 00000036526cd710 CR3: 000000003d0ee000 CR4: 00000000000006f0
00:24:14:DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
00:24:14:DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
00:24:14:Process dd (pid: 1404, threadinfo ffff880030ce0000, task ffff88007d082aa0)
00:24:14:Stack:
00:24:14: ffff880030ce17f8 ffffffffa0275889 00000000002537dc ffffea000115a9f0
00:24:14:&amp;lt;d&amp;gt; ffff88003e877950 0000000000000000 0000000003b5e000 0000000003b5efff
00:24:14:&amp;lt;d&amp;gt; ffff880030ce1998 ffffffffa079bddd f869fda2cf0897a1 0000000000000000
00:24:14:Call Trace:
00:24:15: [&amp;lt;ffffffffa0275889&amp;gt;] ? osc_queue_async_io+0x399/0x1140 [osc]
00:24:15: [&amp;lt;ffffffffa079bddd&amp;gt;] ? ll_prepare_write+0x50d/0x1230 [lustre]
00:24:15: [&amp;lt;ffffffffa072adce&amp;gt;] ? lov_stripe_offset+0x28e/0x340 [lov]
00:24:15: [&amp;lt;ffffffffa072a8db&amp;gt;] ? lov_tgt_seq_show+0x26b/0x300 [lov]
00:24:16: [&amp;lt;ffffffffa070d0a9&amp;gt;] ? lov_queue_async_io+0x149/0x4a0 [lov]
00:24:16: [&amp;lt;ffffffffa0795780&amp;gt;] ? queue_or_sync_write+0x160/0xda0 [lustre]
00:24:16: [&amp;lt;ffffffffa07a2c2b&amp;gt;] ? ll_stats_ops_tally+0x6b/0xd0 [lustre]
00:24:16: [&amp;lt;ffffffffa079cde5&amp;gt;] ? ll_commit_write+0x2e5/0x750 [lustre]
00:24:16: [&amp;lt;ffffffffa07b4333&amp;gt;] ? ll_write_begin+0x83/0x210 [lustre]
00:24:16: [&amp;lt;ffffffffa07b4280&amp;gt;] ? ll_write_end+0x30/0x60 [lustre]
00:24:16: [&amp;lt;ffffffff811107fa&amp;gt;] ? generic_file_buffered_write+0x18a/0x2e0
00:24:16: [&amp;lt;ffffffff81070f97&amp;gt;] ? current_fs_time+0x27/0x30
00:24:16: [&amp;lt;ffffffff81112130&amp;gt;] ? __generic_file_aio_write+0x250/0x480
00:24:16: [&amp;lt;ffffffffa0765dba&amp;gt;] ? ll_file_get_tree_lock_iov+0x14a/0x810 [lustre]
00:24:16: [&amp;lt;ffffffff811123cf&amp;gt;] ? generic_file_aio_write+0x6f/0xe0
00:24:16: [&amp;lt;ffffffffa0772449&amp;gt;] ? ll_file_aio_write+0xa19/0x1c60 [lustre]
00:24:16: [&amp;lt;ffffffffa0773760&amp;gt;] ? ll_file_write+0xd0/0xf0 [lustre]
00:24:16: [&amp;lt;ffffffff8105a5c3&amp;gt;] ? perf_event_task_sched_out+0x33/0x80
00:24:16: [&amp;lt;ffffffff81090990&amp;gt;] ? autoremove_wake_function+0x0/0x40
00:24:16: [&amp;lt;ffffffff8120ca26&amp;gt;] ? security_file_permission+0x16/0x20
00:24:16: [&amp;lt;ffffffff8117646d&amp;gt;] ? rw_verify_area+0x5d/0xc0
00:24:16: [&amp;lt;ffffffff81176588&amp;gt;] ? vfs_write+0xb8/0x1a0
00:24:16: [&amp;lt;ffffffff81176e81&amp;gt;] ? sys_write+0x51/0x90
00:24:16: [&amp;lt;ffffffff810d3a75&amp;gt;] ? __audit_syscall_exit+0x265/0x290
00:24:16: [&amp;lt;ffffffff8100b072&amp;gt;] ? system_call_fastpath+0x16/0x1b
00:24:16:Code: 00 00 00 01 74 05 e8 72 8c d8 ff c9 c3 55 48 89 e5 0f 1f 44 00 00 b8 00 00 01 00 3e 0f c1 07 0f b7 d0 c1 e8 10 39 c2 74 0e f3 90 &amp;lt;0f&amp;gt; 1f 44 00 00 83 3f 00 75 f4 eb df c9 c3 0f 1f 40 00 55 48 89
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Maloo report: &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/83fa5d12-6a23-11e2-85d4-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/83fa5d12-6a23-11e2-85d4-52540035b04c&lt;/a&gt;&lt;/p&gt;</description>
                <environment>Lustre Branch: b1_8&lt;br/&gt;
Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b1_8/252&quot;&gt;http://build.whamcloud.com/job/lustre-b1_8/252&lt;/a&gt;&lt;br/&gt;
Distro/Arch: RHEL6.3 (client), RHEL5.9 (server)&lt;br/&gt;
</environment>
        <key id="17341">LU-2703</key>
            <summary>racer: BUG: soft lockup - CPU#0 stuck for 67s! [dd:1404]</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="bobijam">Zhenyu Xu</assignee>
                                    <reporter username="yujian">Jian Yu</reporter>
                        <labels>
                    </labels>
                <created>Tue, 29 Jan 2013 10:13:31 +0000</created>
                <updated>Mon, 18 Feb 2013 04:38:36 +0000</updated>
                            <resolved>Mon, 18 Feb 2013 04:38:35 +0000</resolved>
                                    <version>Lustre 1.8.9</version>
                                    <fixVersion>Lustre 1.8.9</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>9</watches>
                                                                            <comments>
                            <comment id="51394" author="yujian" created="Tue, 29 Jan 2013 10:17:25 +0000"  >&lt;p&gt;This is a regression issue after &lt;a href=&quot;http://review.whamcloud.com/5132&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/5132&lt;/a&gt; was landed.&lt;/p&gt;</comment>
                            <comment id="51441" author="pjones" created="Wed, 30 Jan 2013 01:01:14 +0000"  >&lt;p&gt;Bobijam&lt;/p&gt;

&lt;p&gt;Could you please look into this one?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="51444" author="bobijam" created="Wed, 30 Jan 2013 02:08:18 +0000"  >&lt;p&gt;this specific case (&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/83fa5d12-6a23-11e2-85d4-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/83fa5d12-6a23-11e2-85d4-52540035b04c&lt;/a&gt;) is due to MDS journal dead for some unknown reason.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedHeader panelHeader&quot; style=&quot;border-bottom-width: 1px;&quot;&gt;&lt;b&gt;dead process on MDS&lt;/b&gt;&lt;/div&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;01:17:06:kjournald     D ffff810002536420     0   392     23           412   383 (L-TLB)
01:17:06: ffff81007f10bca0 0000000000000046 0000000000011220 0000000000000046
01:17:06: 0000000000000220 000000000000000a ffff810037fe30c0 ffffffff8031db60
01:17:06: 0000178e3069efc5 000000000000edfb ffff810037fe32a8 000000008006e6b3
01:17:06:Call Trace:
01:17:06: [&amp;lt;ffffffff8006ed48&amp;gt;] do_gettimeofday+0x40/0x90
01:17:06: [&amp;lt;ffffffff800155b4&amp;gt;] sync_buffer+0x0/0x3f
01:17:06: [&amp;lt;ffffffff800637de&amp;gt;] io_schedule+0x3f/0x67
01:17:06: [&amp;lt;ffffffff800155ef&amp;gt;] sync_buffer+0x3b/0x3f
01:17:06: [&amp;lt;ffffffff80063a0a&amp;gt;] __wait_on_bit+0x40/0x6e
01:17:06: [&amp;lt;ffffffff800155b4&amp;gt;] sync_buffer+0x0/0x3f
01:17:06: [&amp;lt;ffffffff80063aa4&amp;gt;] out_of_line_wait_on_bit+0x6c/0x78
01:17:06: [&amp;lt;ffffffff800a3c15&amp;gt;] wake_bit_function+0x0/0x23
01:17:06: [&amp;lt;ffffffff8803331c&amp;gt;] :jbd:journal_commit_transaction+0xa7f/0x132b
01:17:06: [&amp;lt;ffffffff8003dde5&amp;gt;] lock_timer_base+0x1b/0x3c
01:17:06: [&amp;lt;ffffffff88037489&amp;gt;] :jbd:kjournald+0xc1/0x213
01:17:06: [&amp;lt;ffffffff800a3be7&amp;gt;] autoremove_wake_function+0x0/0x2e
01:17:06: [&amp;lt;ffffffff800a39cf&amp;gt;] keventd_create_kthread+0x0/0xc4
01:17:06: [&amp;lt;ffffffff880373c8&amp;gt;] :jbd:kjournald+0x0/0x213
01:17:06: [&amp;lt;ffffffff800a39cf&amp;gt;] keventd_create_kthread+0x0/0xc4
01:17:06: [&amp;lt;ffffffff80032c45&amp;gt;] kthread+0xfe/0x132
01:17:06: [&amp;lt;ffffffff8005dfc1&amp;gt;] child_rip+0xa/0x11
01:17:06: [&amp;lt;ffffffff800a39cf&amp;gt;] keventd_create_kthread+0x0/0xc4
01:17:06: [&amp;lt;ffffffff80032b47&amp;gt;] kthread+0x0/0x132
01:17:06: [&amp;lt;ffffffff8005dfb7&amp;gt;] child_rip+0x0/0x11

01:17:07:syslogd       D ffff810037fe30c0     0  1607      1          1610  1585 (NOTLB)
01:17:07: ffff8100741bdd98 0000000000000082 00000000741bdd18 0000000000000001
01:17:07: 0000000000000000 0000000000000009 ffff81007ec287a0 ffff810037fe30c0
01:17:07: 0000178e2ecd14cd 000000000001708d ffff81007ec28988 0000000000000092
01:17:07:Call Trace:
01:17:07: [&amp;lt;ffffffff8002e4db&amp;gt;] __wake_up+0x38/0x4f
01:17:07: [&amp;lt;ffffffff8803682d&amp;gt;] :jbd:log_wait_commit+0xa3/0xf5
01:17:07: [&amp;lt;ffffffff800a3be7&amp;gt;] autoremove_wake_function+0x0/0x2e
01:17:07: [&amp;lt;ffffffff880307f8&amp;gt;] :jbd:journal_stop+0x22a/0x259
01:17:07: [&amp;lt;ffffffff8002ff63&amp;gt;] __writeback_single_inode+0x1dd/0x31c
01:17:07: [&amp;lt;ffffffff800e4b09&amp;gt;] do_readv_writev+0x26e/0x291
01:17:07: [&amp;lt;ffffffff800f8a7a&amp;gt;] sync_inode+0x24/0x33
01:17:07: [&amp;lt;ffffffff8804d52e&amp;gt;] :ext3:ext3_sync_file+0xce/0xf8
01:17:07: [&amp;lt;ffffffff800504b1&amp;gt;] do_fsync+0x52/0xa4
01:17:07: [&amp;lt;ffffffff800e5424&amp;gt;] __do_fsync+0x23/0x36
01:17:07: [&amp;lt;ffffffff8005d29e&amp;gt;] tracesys+0xd5/0xdf
01:17:07:
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="51536" author="johann" created="Thu, 31 Jan 2013 11:29:08 +0000"  >&lt;p&gt;syslogd is waiting for kjournald and kjournald is waiting for I/O to complete. What do you mean by journal is dead?&lt;/p&gt;

&lt;p&gt;From the original report, it seems that we are stuck on a spinlock for 67s, so i tend to think that someone is doing something it shouldn&apos;t (e.g. sleeping, locking ordering issue) while holding a spinlock.&lt;/p&gt;</comment>
                            <comment id="51544" author="yujian" created="Thu, 31 Jan 2013 12:01:38 +0000"  >&lt;p&gt;Lustre Branch: b1_8&lt;br/&gt;
Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b1_8/252&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b1_8/252&lt;/a&gt;&lt;br/&gt;
Distro/Arch: RHEL6.3 (client), RHEL5.9 (server)&lt;/p&gt;

&lt;p&gt;This issue occurred constantly on Lustre b1_8 build #252: &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/0fe4438e-6bc7-11e2-b2a5-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/0fe4438e-6bc7-11e2-b2a5-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="51632" author="johann" created="Fri, 1 Feb 2013 12:37:11 +0000"  >&lt;p&gt;hm, we are looping in osc_announce_cached() in the last report and it was osc_queue_async_io() in the original one. I suspect the issue is related to messing up with the cl_loi_list_lock. Unfortunately, sysrq-t output in maloo is full of soft lockup messages. I think we should try to collect a crash dump.&lt;/p&gt;</comment>
                            <comment id="51635" author="bobijam" created="Fri, 1 Feb 2013 13:27:19 +0000"  >&lt;p&gt;For the record, I could not reproduce it for RHEL5.9 server and client (both use 2.6.18-348.1.1.el5)&lt;/p&gt;</comment>
                            <comment id="51683" author="yujian" created="Mon, 4 Feb 2013 01:01:31 +0000"  >&lt;p&gt;Hi Bobi and Johann,&lt;/p&gt;

&lt;p&gt;I reproduced this issue manually on Toro client-12vm&lt;span class=&quot;error&quot;&gt;&amp;#91;1-4&amp;#93;&lt;/span&gt; against Lustre b1_8 build #252 and got the kernel dump file of client-12vm1.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Client1: client-12vm1 (RHEL6.3 2.6.32-279.19.1.el6.x86_64)
Client2: client-12vm2 (RHEL6.3 2.6.32-279.19.1.el6.x86_64)
MGS/MDS: client-12vm3 (RHEL5.9 2.6.18-348.1.1.el5_lustre.g3480bb0)
OSS: client-12vm4     (RHEL5.9 2.6.18-348.1.1.el5_lustre.g3480bb0)  
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The dump file is /scratch/logs/1.8.9/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2703&quot; title=&quot;racer: BUG: soft lockup - CPU#0 stuck for 67s! [dd:1404]&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2703&quot;&gt;&lt;del&gt;LU-2703&lt;/del&gt;&lt;/a&gt;/client-12vm1.dump on brent node.&lt;/p&gt;

&lt;p&gt;I also put the kernel-debuginfo-* packages into that directory.&lt;/p&gt;</comment>
                            <comment id="51721" author="yujian" created="Mon, 4 Feb 2013 09:29:27 +0000"  >&lt;p&gt;FYI, client-12vm&lt;span class=&quot;error&quot;&gt;&amp;#91;1-4&amp;#93;&lt;/span&gt; are remaining for debugging.&lt;/p&gt;</comment>
                            <comment id="51723" author="bobijam" created="Mon, 4 Feb 2013 10:29:40 +0000"  >&lt;p&gt;osc_update_grant is trying spin_lock cl_loi_list_lock, but I haven&apos;t found where the lock has been held at the moment.&lt;/p&gt;

&lt;div class=&quot;panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;panelContent&quot;&gt;
&lt;p&gt;crash&amp;gt; bt -a&lt;br/&gt;
PID: 3789   TASK: ffff88007c73a040  CPU: 0   COMMAND: &quot;dd&quot;&lt;br/&gt;
    &lt;span class=&quot;error&quot;&gt;&amp;#91;exception RIP: _spin_lock+38&amp;#93;&lt;/span&gt;&lt;br/&gt;
    RIP: ffffffff814ec546  RSP: ffff88003318b388  RFLAGS: 00000206&lt;br/&gt;
    RAX: 0000000000000001  RBX: ffff88007ca5c180  RCX: 0000000000000030&lt;br/&gt;
    RDX: 0000000000000000  RSI: ffff88007ca5c180  RDI: ffff880079b7069c&lt;br/&gt;
    RBP: ffff88003318b388   R8: 0000000000000000   R9: 0000000000001000&lt;br/&gt;
    R10: ffff880037528310  R11: 000000000000000f  R12: ffff880079b705d0&lt;br/&gt;
    R13: ffff880079b7069c  R14: ffff880079b705d0  R15: 0000000000001000&lt;br/&gt;
    CS: 0010  SS: 0018&lt;br/&gt;
 #0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88003318b390&amp;#93;&lt;/span&gt; osc_update_grant at &lt;font color=&quot;red&quot;&gt;ffffffffa02667fe&lt;/font&gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;osc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #1 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88003318b3e0&amp;#93;&lt;/span&gt; osc_brw_fini_request at ffffffffa027727f &lt;span class=&quot;error&quot;&gt;&amp;#91;osc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #2 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88003318b500&amp;#93;&lt;/span&gt; osc_brw at ffffffffa0278931 &lt;span class=&quot;error&quot;&gt;&amp;#91;osc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #3 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88003318b640&amp;#93;&lt;/span&gt; lov_brw at ffffffffa06879c9 &lt;span class=&quot;error&quot;&gt;&amp;#91;lov&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #4 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88003318b6f0&amp;#93;&lt;/span&gt; ll_prepare_write at ffffffffa070a3fe &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #5 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88003318b9a0&amp;#93;&lt;/span&gt; ll_write_begin at ffffffffa0722333 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #6 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88003318ba20&amp;#93;&lt;/span&gt; generic_file_buffered_write at ffffffff81110793&lt;br/&gt;
 #7 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88003318baf0&amp;#93;&lt;/span&gt; __generic_file_aio_write at ffffffff81112130&lt;br/&gt;
 #8 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88003318bbb0&amp;#93;&lt;/span&gt; generic_file_aio_write at ffffffff811123cf&lt;br/&gt;
 #9 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88003318bc00&amp;#93;&lt;/span&gt; ll_file_aio_write at ffffffffa06e0449 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;&lt;br/&gt;
#10 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88003318bdd0&amp;#93;&lt;/span&gt; ll_file_write at ffffffffa06e1760 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;&lt;br/&gt;
#11 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88003318bef0&amp;#93;&lt;/span&gt; vfs_write at ffffffff81176588&lt;br/&gt;
#12 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88003318bf30&amp;#93;&lt;/span&gt; sys_write at ffffffff81176e81&lt;br/&gt;
#13 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88003318bf80&amp;#93;&lt;/span&gt; system_call_fastpath at ffffffff8100b072&lt;br/&gt;
crash&amp;gt; dis osc_update_grant&lt;br/&gt;
0xffffffffa02667d0 &amp;lt;osc_update_grant&amp;gt;:  push   %rbp&lt;br/&gt;
0xffffffffa02667d1 &amp;lt;osc_update_grant+1&amp;gt;:        mov    %rsp,%rbp&lt;br/&gt;
0xffffffffa02667d4 &amp;lt;osc_update_grant+4&amp;gt;:        sub    $0x40,%rsp&lt;br/&gt;
0xffffffffa02667d8 &amp;lt;osc_update_grant+8&amp;gt;:        mov    %rbx,-0x18(%rbp)&lt;br/&gt;
0xffffffffa02667dc &amp;lt;osc_update_grant+12&amp;gt;:       mov    %r12,-0x10(%rbp)&lt;br/&gt;
0xffffffffa02667e0 &amp;lt;osc_update_grant+16&amp;gt;:       mov    %r13,-0x8(%rbp)&lt;br/&gt;
0xffffffffa02667e4 &amp;lt;osc_update_grant+20&amp;gt;:       nopl   0x0(%rax,%rax,1)&lt;br/&gt;
0xffffffffa02667e9 &amp;lt;osc_update_grant+25&amp;gt;:       lea    0xcc(%rdi),%r13&lt;br/&gt;
0xffffffffa02667f0 &amp;lt;osc_update_grant+32&amp;gt;:       mov    %rdi,%r12&lt;br/&gt;
0xffffffffa02667f3 &amp;lt;osc_update_grant+35&amp;gt;:       mov    %rsi,%rbx&lt;br/&gt;
0xffffffffa02667f6 &amp;lt;osc_update_grant+38&amp;gt;:       mov    %r13,%rdi&lt;br/&gt;
0xffffffffa02667f9 &amp;lt;osc_update_grant+41&amp;gt;:       callq  0xffffffff814ec520 &amp;lt;_spin_lock&amp;gt;  &lt;font color=&quot;blue&quot;&gt;    =====&amp;gt; client_obd_list_lock(&amp;amp;cli-&amp;gt;cl_loi_list_lock); &lt;/font&gt;&lt;br/&gt;
&lt;font color=&quot;red&quot;&gt;0xffffffffa02667fe&lt;/font&gt; &amp;lt;osc_update_grant+46&amp;gt;:       testb  $0x20,0x14833f(%rip)        # 0xffffffffa03aeb44&lt;br/&gt;
0xffffffffa0266805 &amp;lt;osc_update_grant+53&amp;gt;:       je     0xffffffffa0266810&lt;br/&gt;
0xffffffffa0266807 &amp;lt;osc_update_grant+55&amp;gt;:       testb  $0x8,0x148332(%rip)        # 0xffffffffa03aeb40&lt;br/&gt;
0xffffffffa026680e &amp;lt;osc_update_grant+62&amp;gt;:       jne    0xffffffffa0266840&lt;br/&gt;
0xffffffffa0266810 &amp;lt;osc_update_grant+64&amp;gt;:       mov    (%rbx),%rax&lt;br/&gt;
0xffffffffa0266813 &amp;lt;osc_update_grant+67&amp;gt;:       test   $0x8000000,%eax&lt;br/&gt;
0xffffffffa0266818 &amp;lt;osc_update_grant+72&amp;gt;:       je     0xffffffffa0266823&lt;br/&gt;
0xffffffffa026681a &amp;lt;osc_update_grant+74&amp;gt;:       mov    0x48(%rbx),%rax&lt;/p&gt;


&lt;p&gt;crash&amp;gt; ps  | grep dd&lt;br/&gt;
      2      0   0  ffff88007dc84aa0  IN   0.0       0      0  &lt;span class=&quot;error&quot;&gt;&amp;#91;kthreadd&amp;#93;&lt;/span&gt;&lt;br/&gt;
     13      2   0  ffff88007dd55500  RU   0.0       0      0  &lt;span class=&quot;error&quot;&gt;&amp;#91;sync_supers&amp;#93;&lt;/span&gt;&lt;br/&gt;
     14      2   0  ffff88007dd54aa0  RU   0.0       0      0  &lt;span class=&quot;error&quot;&gt;&amp;#91;bdi-default&amp;#93;&lt;/span&gt;&lt;br/&gt;
     15      2   0  ffff88007dd54040  IN   0.0       0      0  &lt;span class=&quot;error&quot;&gt;&amp;#91;kintegrityd/0&amp;#93;&lt;/span&gt;&lt;br/&gt;
    903      2   0  ffff880037b84080  RU   0.0       0      0  &lt;span class=&quot;error&quot;&gt;&amp;#91;ib_addr&amp;#93;&lt;/span&gt;&lt;br/&gt;
   1415   1387   0  ffff880037978080  IN   0.1   20220   1076  hald-addon-inpu&lt;br/&gt;
   1431   1387   0  ffff880037ffb500  IN   0.0   17804   1036  hald-addon-acpi&lt;br/&gt;
&amp;gt;  3789   6414   0  ffff88007c73a040  RU   0.0  105180    664  dd&lt;br/&gt;
   6401   6394   0  ffff880079bdd500  IN   0.1  106096   1244  file_rename.sh&lt;br/&gt;
  15786   6398   0  ffff88007bef1500  IN   0.0  105180    664  dd&lt;br/&gt;
  17462   6406   0  ffff88007d7ed540  IN   0.0  105180    664  dd&lt;br/&gt;
  18320   6404   0  ffff880079cdd500  UN   0.0  115700    992  ls&lt;br/&gt;
crash&amp;gt; bt 15786&lt;br/&gt;
PID: 15786  TASK: ffff88007bef1500  CPU: 0   COMMAND: &quot;dd&quot;&lt;br/&gt;
 #0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880078d715e8&amp;#93;&lt;/span&gt; schedule at ffffffff814e9c02&lt;br/&gt;
 #1 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880078d716b0&amp;#93;&lt;/span&gt; osc_queue_async_io at &lt;font color=&quot;red&quot;&gt;ffffffffa0275e72&lt;/font&gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;osc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #2 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880078d71800&amp;#93;&lt;/span&gt; lov_queue_async_io at ffffffffa067b0a9 &lt;span class=&quot;error&quot;&gt;&amp;#91;lov&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #3 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880078d71880&amp;#93;&lt;/span&gt; queue_or_sync_write at ffffffffa0703780 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #4 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880078d71960&amp;#93;&lt;/span&gt; ll_commit_write at ffffffffa070ade5 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #5 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880078d719f0&amp;#93;&lt;/span&gt; ll_write_end at ffffffffa0722280 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #6 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880078d71a20&amp;#93;&lt;/span&gt; generic_file_buffered_write at ffffffff811107fa&lt;br/&gt;
 #7 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880078d71af0&amp;#93;&lt;/span&gt; __generic_file_aio_write at ffffffff81112130&lt;br/&gt;
 #8 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880078d71bb0&amp;#93;&lt;/span&gt; generic_file_aio_write at ffffffff811123cf&lt;br/&gt;
 #9 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880078d71c00&amp;#93;&lt;/span&gt; ll_file_aio_write at ffffffffa06e0449 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;&lt;br/&gt;
#10 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880078d71dd0&amp;#93;&lt;/span&gt; ll_file_write at ffffffffa06e1760 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;&lt;br/&gt;
#11 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880078d71ef0&amp;#93;&lt;/span&gt; vfs_write at ffffffff81176588&lt;br/&gt;
#12 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880078d71f30&amp;#93;&lt;/span&gt; sys_write at ffffffff81176e81&lt;br/&gt;
#13 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880078d71f80&amp;#93;&lt;/span&gt; system_call_fastpath at ffffffff8100b072&lt;/p&gt;

&lt;p&gt;crash&amp;gt; bt 17462&lt;br/&gt;
PID: 17462  TASK: ffff88007d7ed540  CPU: 0   COMMAND: &quot;dd&quot;&lt;br/&gt;
 #0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8800784d35e8&amp;#93;&lt;/span&gt; schedule at ffffffff814e9c02&lt;br/&gt;
 #1 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8800784d36b0&amp;#93;&lt;/span&gt; osc_queue_async_io at &lt;font color=&quot;red&quot;&gt;ffffffffa0275e72&lt;/font&gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;osc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #2 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8800784d3800&amp;#93;&lt;/span&gt; lov_queue_async_io at ffffffffa067b0a9 &lt;span class=&quot;error&quot;&gt;&amp;#91;lov&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #3 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8800784d3880&amp;#93;&lt;/span&gt; queue_or_sync_write at ffffffffa0703780 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #4 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8800784d3960&amp;#93;&lt;/span&gt; ll_commit_write at ffffffffa070ade5 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #5 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8800784d39f0&amp;#93;&lt;/span&gt; ll_write_end at ffffffffa0722280 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #6 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8800784d3a20&amp;#93;&lt;/span&gt; generic_file_buffered_write at ffffffff811107fa&lt;br/&gt;
 #7 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8800784d3af0&amp;#93;&lt;/span&gt; __generic_file_aio_write at ffffffff81112130&lt;br/&gt;
 #8 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8800784d3bb0&amp;#93;&lt;/span&gt; generic_file_aio_write at ffffffff811123cf&lt;br/&gt;
 #9 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8800784d3c00&amp;#93;&lt;/span&gt; ll_file_aio_write at ffffffffa06e0449 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;&lt;br/&gt;
#10 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8800784d3dd0&amp;#93;&lt;/span&gt; ll_file_write at ffffffffa06e1760 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;&lt;br/&gt;
#11 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8800784d3ef0&amp;#93;&lt;/span&gt; vfs_write at ffffffff81176588&lt;br/&gt;
#12 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8800784d3f30&amp;#93;&lt;/span&gt; sys_write at ffffffff81176e81&lt;br/&gt;
#13 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8800784d3f80&amp;#93;&lt;/span&gt; system_call_fastpath at ffffffff8100b072&lt;br/&gt;
    RIP: 00000036d9edae60  RSP: 00007fffce21ace0  RFLAGS: 00000202&lt;br/&gt;
    RAX: 0000000000000001  RBX: ffffffff8100b072  RCX: 00000036d9edae60&lt;br/&gt;
    RDX: 0000000000000400  RSI: 0000000000ed9000  RDI: 0000000000000001&lt;br/&gt;
    RBP: 0000000000ed9000   R8: 00000036da18eee8   R9: 0000000000000001&lt;br/&gt;
    R10: 0000000000002403  R11: 0000000000000246  R12: 0000000000ed8fff&lt;br/&gt;
    R13: 0000000000000000  R14: 0000000000000400  R15: 0000000000000000&lt;br/&gt;
    ORIG_RAX: 0000000000000001  CS: 0033  SS: 002b&lt;/p&gt;

&lt;p&gt;crash&amp;gt; dis 0xffffffffa0275e41 20&lt;br/&gt;
0xffffffffa0275e41 &amp;lt;osc_queue_async_io+2385&amp;gt;:   mov    -0xd8(%rbp),%rdx&lt;br/&gt;
0xffffffffa0275e48 &amp;lt;osc_queue_async_io+2392&amp;gt;:   mov    %rbx,-0xb8(%rbp)&lt;br/&gt;
0xffffffffa0275e4f &amp;lt;osc_queue_async_io+2399&amp;gt;:   mov    %rdx,%rbx&lt;br/&gt;
0xffffffffa0275e52 &amp;lt;osc_queue_async_io+2402&amp;gt;:   mov    $0x1,%eax&lt;br/&gt;
0xffffffffa0275e57 &amp;lt;osc_queue_async_io+2407&amp;gt;:   xchg   %rax,(%rbx)&lt;br/&gt;
0xffffffffa0275e5a &amp;lt;osc_queue_async_io+2410&amp;gt;:   mov    -0xa8(%rbp),%rdi&lt;br/&gt;
0xffffffffa0275e61 &amp;lt;osc_queue_async_io+2417&amp;gt;:   mov    %r13,%rsi&lt;br/&gt;
0xffffffffa0275e64 &amp;lt;osc_queue_async_io+2420&amp;gt;:   callq  0xffffffffa0266890 &amp;lt;ocw_granted&amp;gt;&lt;br/&gt;
0xffffffffa0275e69 &amp;lt;osc_queue_async_io+2425&amp;gt;:   test   %eax,%eax&lt;br/&gt;
0xffffffffa0275e6b &amp;lt;osc_queue_async_io+2427&amp;gt;:   jne    0xffffffffa0275e92&lt;br/&gt;
0xffffffffa0275e6d &amp;lt;osc_queue_async_io+2429&amp;gt;:   callq  0xffffffff814e9850 &amp;lt;schedule&amp;gt; &lt;font color=&quot;blue&quot;&gt;   =====&amp;gt; l_wait_event(ocw.ocw_waitq, ocw_granted(cli, &amp;amp;ocw), &amp;amp;lwi);&lt;/font&gt;&lt;br/&gt;
&lt;font color=&quot;red&quot;&gt;0xffffffffa0275e72&lt;/font&gt; &amp;lt;osc_queue_async_io+2434&amp;gt;:   mov    -0xa8(%rbp),%rdi&lt;br/&gt;
0xffffffffa0275e79 &amp;lt;osc_queue_async_io+2441&amp;gt;:   mov    %r13,%rsi&lt;br/&gt;
0xffffffffa0275e7c &amp;lt;osc_queue_async_io+2444&amp;gt;:   callq  0xffffffffa0266890 &amp;lt;ocw_granted&amp;gt;&lt;br/&gt;
0xffffffffa0275e81 &amp;lt;osc_queue_async_io+2449&amp;gt;:   test   %eax,%eax&lt;br/&gt;
0xffffffffa0275e83 &amp;lt;osc_queue_async_io+2451&amp;gt;:   jne    0xffffffffa0275e92&lt;br/&gt;
0xffffffffa0275e85 &amp;lt;osc_queue_async_io+2453&amp;gt;:   callq  0xffffffffa039e250 &amp;lt;cfs_signal_pending&amp;gt;&lt;br/&gt;
0xffffffffa0275e8a &amp;lt;osc_queue_async_io+2458&amp;gt;:   test   %eax,%eax&lt;br/&gt;
0xffffffffa0275e8c &amp;lt;osc_queue_async_io+2460&amp;gt;:   nopl   0x0(%rax)&lt;/p&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="51729" author="johann" created="Mon, 4 Feb 2013 13:09:18 +0000"  >&lt;p&gt;Another possibility is that we forgot somewhere to unlock the spinlock ...&lt;/p&gt;</comment>
                            <comment id="51731" author="keith" created="Mon, 4 Feb 2013 14:00:16 +0000"  >&lt;p&gt;I had started on setting up a local setup to help get the crash but I see we have that first step.  I am now working to debug the spinlock.  It will be good to see who touched the lock last.  &lt;/p&gt;</comment>
                            <comment id="51771" author="keith" created="Mon, 4 Feb 2013 23:48:57 +0000"  >&lt;p&gt;Sorry no hard data from my local debug setup yet, I wasted some time with a 1.8 build environment when I should have just grabbed the 1.8.9 rpms from the build.  In general the plan for the Master Client is to ensure that when a spinlock is acquired the place and task is captured so that when everyone is waiting the lock can be examined to see who and where it was touched last. With any luck it is not a small window, and a debug spinlock will not cause the issue to be skipped.&lt;/p&gt;

&lt;p&gt;With the current state of master undergoing such change, is there a version of master that is known to work? &lt;/p&gt;</comment>
                            <comment id="51778" author="bobijam" created="Tue, 5 Feb 2013 06:38:33 +0000"  >&lt;p&gt;I think I&apos;ve found who holds the cl_loi_list_lock and sleeps away.&lt;/p&gt;

&lt;div class=&quot;panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;panelContent&quot;&gt;
&lt;p&gt;crash&amp;gt; ps | grep ptlrpcd&lt;br/&gt;
   1808      2   0  ffff880037f0f540  RU   0.0       0      0  &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpcd&amp;#93;&lt;/span&gt;&lt;br/&gt;
   1809      2   0  ffff88007d3af500  RU   0.0       0      0  &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpcd-recov&amp;#93;&lt;/span&gt;&lt;br/&gt;
crash&amp;gt; bt 1808&lt;br/&gt;
PID: 1808   TASK: ffff880037f0f540  CPU: 0   COMMAND: &quot;ptlrpcd&quot;&lt;br/&gt;
 #0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88007c881620&amp;#93;&lt;/span&gt; schedule at ffffffff814e9c02&lt;br/&gt;
 #1 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88007c8816e8&amp;#93;&lt;/span&gt; __cond_resched at ffffffff8106118a&lt;br/&gt;
 #2 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88007c881708&amp;#93;&lt;/span&gt; _cond_resched at ffffffff814ea610&lt;br/&gt;
 #3 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88007c881718&amp;#93;&lt;/span&gt; __kmalloc at ffffffff8115e8b0&lt;br/&gt;
 #4 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88007c881768&amp;#93;&lt;/span&gt; cfs_alloc at ffffffffa039e701 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #5 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88007c881798&amp;#93;&lt;/span&gt; ldlm_bl_to_thread at ffffffffa051b2e9 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #6 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88007c8818c8&amp;#93;&lt;/span&gt; ldlm_bl_to_thread_lock at ffffffffa051b8eb &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #7 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88007c881918&amp;#93;&lt;/span&gt; ldlm_lock_decref_internal at ffffffffa04f7f9d &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #8 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88007c881998&amp;#93;&lt;/span&gt; ldlm_lock_decref at ffffffffa04f8fd9 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 #9 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88007c8819e8&amp;#93;&lt;/span&gt; osc_cancel at ffffffffa026962e &lt;span class=&quot;error&quot;&gt;&amp;#91;osc&amp;#93;&lt;/span&gt;&lt;br/&gt;
#10 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88007c881a38&amp;#93;&lt;/span&gt; lov_cancel at ffffffffa0684e0b &lt;span class=&quot;error&quot;&gt;&amp;#91;lov&amp;#93;&lt;/span&gt;&lt;br/&gt;
#11 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88007c881b58&amp;#93;&lt;/span&gt; ll_ap_completion at ffffffffa0701669 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;&lt;br/&gt;
#12 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88007c881c18&amp;#93;&lt;/span&gt; lov_ap_completion at ffffffffa0677b68 &lt;span class=&quot;error&quot;&gt;&amp;#91;lov&amp;#93;&lt;/span&gt;&lt;br/&gt;
#13 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88007c881c48&amp;#93;&lt;/span&gt; osc_ap_completion at ffffffffa02683b3 &lt;span class=&quot;error&quot;&gt;&amp;#91;osc&amp;#93;&lt;/span&gt; &lt;font color=&quot;red&quot;&gt; ====&amp;gt; here ptlrpcd has holden the cl_loi_list_lock&lt;/font&gt;&lt;br/&gt;
#14 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88007c881cb8&amp;#93;&lt;/span&gt; brw_interpret at ffffffffa0279429 &lt;span class=&quot;error&quot;&gt;&amp;#91;osc&amp;#93;&lt;/span&gt;&lt;br/&gt;
#15 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88007c881d58&amp;#93;&lt;/span&gt; ptlrpc_check_set at ffffffffa0530a9a &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
#16 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88007c881e38&amp;#93;&lt;/span&gt; ptlrpcd_check at ffffffffa05677ad &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
#17 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88007c881e98&amp;#93;&lt;/span&gt; ptlrpcd at ffffffffa0567a50 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
#18 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff88007c881f48&amp;#93;&lt;/span&gt; kernel_thread at ffffffff8100c0ca&lt;/p&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Will push a patch for review.&lt;/p&gt;</comment>
                            <comment id="51779" author="johann" created="Tue, 5 Feb 2013 06:45:41 +0000"  >&lt;p&gt;I think master has a patch for this already (it prevents kmalloc on this path).&lt;/p&gt;</comment>
                            <comment id="51780" author="bobijam" created="Tue, 5 Feb 2013 06:50:33 +0000"  >&lt;p&gt;this issue is a little related to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2468&quot; title=&quot;MDS out of memory, blocked in ldlm_pools_shrink()&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2468&quot;&gt;&lt;del&gt;LU-2468&lt;/del&gt;&lt;/a&gt;, they are both involved in trying to allocate memory during memory freeing phase.&lt;/p&gt;</comment>
                            <comment id="51782" author="bobijam" created="Tue, 5 Feb 2013 07:19:09 +0000"  >&lt;p&gt;patch tracking at &lt;a href=&quot;http://review.whamcloud.com/5272&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/5272&lt;/a&gt;&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedHeader panelHeader&quot; style=&quot;border-bottom-width: 1px;&quot;&gt;&lt;b&gt;commit message&lt;/b&gt;&lt;/div&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;LU-2703 osc: ptlrpcd scheduled with a spinlock

When memory is stringent, there is a chance that the ptlrpcd calling
path reschedules waiting for memory available while holding a spinlock,
the backtrace is as follows:

PID: 1808 TASK: ffff880037f0f540 CPU: 0 COMMAND: &quot;ptlrpcd&quot;
0 [ffff88007c881620] schedule at ffffffff814e9c02
1 [ffff88007c8816e8] __cond_resched at ffffffff8106118a
2 [ffff88007c881708] _cond_resched at ffffffff814ea610
3 [ffff88007c881718] __kmalloc at ffffffff8115e8b0
4 [ffff88007c881768] cfs_alloc at ffffffffa039e701 [libcfs]
5 [ffff88007c881798] ldlm_bl_to_thread at ffffffffa051b2e9 [ptlrpc]
6 [ffff88007c8818c8] ldlm_bl_to_thread_lock at ffffffffa051b8eb
							   [ptlrpc]
7 [ffff88007c881918] ldlm_lock_decref_internal at ffffffffa04f7f9d
[ptlrpc]
8 [ffff88007c881998] ldlm_lock_decref at ffffffffa04f8fd9 [ptlrpc]
9 [ffff88007c8819e8] osc_cancel at ffffffffa026962e [osc]
10 [ffff88007c881a38] lov_cancel at ffffffffa0684e0b [lov]
11 [ffff88007c881b58] ll_ap_completion at ffffffffa0701669 [lustre]
12 [ffff88007c881c18] lov_ap_completion at ffffffffa0677b68 [lov]
13 [ffff88007c881c48] osc_ap_completion at ffffffffa02683b3 [osc]
====&amp;gt; here ptlrpcd has held the cl_loi_list_lock
14 [ffff88007c881cb8] brw_interpret at ffffffffa0279429 [osc]
15 [ffff88007c881d58] ptlrpc_check_set at ffffffffa0530a9a [ptlrpc]
16 [ffff88007c881e38] ptlrpcd_check at ffffffffa05677ad [ptlrpc]
17 [ffff88007c881e98] ptlrpcd at ffffffffa0567a50 [ptlrpc]
18 [ffff88007c881f48] kernel_thread at ffffffff8100c0ca

This patch unlocks the spinlock before calling osc_ap_completion().
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="51787" author="johann" created="Tue, 5 Feb 2013 09:22:15 +0000"  >&lt;p&gt;This issue was actually added by this commit 29309746c4049aa7da6cde4cb9a44ec0df2b1af3 (bugzilla 21252) which was a fix for bugzilla 16774. I think nobody uses this feature on 1.8 (which is disabled by default anyway). Maybe we could just revert bugzilla 21252 as well as some part of bugzilla 16774 to restore the default readahead locking behavior?&lt;/p&gt;</comment>
                            <comment id="51858" author="hongchao.zhang" created="Wed, 6 Feb 2013 08:22:06 +0000"  >&lt;p&gt;the obd_cancel call in ll_ap_completion is too big to be included in a spin_lock, for it is easy to cause such an issue,&lt;br/&gt;
we could revert bugzilla 21252&amp;amp;16774 or move the obd_cancel out of *_ap_completion, will create a patch for the second option soon.&lt;/p&gt;</comment>
                            <comment id="51869" author="hongchao.zhang" created="Wed, 6 Feb 2013 11:02:52 +0000"  >&lt;p&gt;the patch is tracked at &lt;a href=&quot;http://review.whamcloud.com/#change,5285&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#change,5285&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="52096" author="bfaccini" created="Sat, 9 Feb 2013 19:32:41 +0000"  >&lt;p&gt;I am not sure but seems to me that some of the test failures may come from memory-leak detected/indicated at obdclass module unload. In fact it is the memory accounting which seems to be wrong and found more memory freed (&quot;leaked: 18446744073709551612&quot;, ie 0xFFFFFFFFFFFFFFFC). This problem may have been induced by last patch #5 where the new macro PPGA_SIZE() plays with pointer size to determine what unit-size to alloc/free in osc_build_ppga()/osc_release_ppga(). But this can fail (actually on archs, like i686 where pointer size is &amp;lt; sizeof(struct lustre_handle)=64-bits) if not all allocations use osc_build_ppga() like in osc_build_req().&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="16899">LU-2468</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvgcf:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>6296</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>