<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:47:31 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-4982] Stack overrun in client under ll_fault</title>
                <link>https://jira.whamcloud.com/browse/LU-4982</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Running a lustre client that is essentially 2.4.0-25chaos (see github.com/chaos/lustre) plus my stathead patch from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4624&quot; title=&quot;ll_stop_statahead deadlock&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4624&quot;&gt;&lt;del&gt;LU-4624&lt;/del&gt;&lt;/a&gt;, a user hit the following kernel Oops:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2014-04-29 11:58:54 BUG: sleeping function called from invalid context at mm/slab.c:3113
2014-04-29 11:58:54 in_atomic(): 0, irqs_disabled(): 0, pid: 130134, name: java
2014-04-29 11:58:54 Pid: 130134, comm: java Tainted: G        W  ---------------    2.6.32-358.14.1.3chaos.ch5.1.1.x8
6_64 #1
2014-04-29 11:58:54 BUG: unable to handle kernel paging request at 00000003811c4f40
2014-04-29 11:58:54 IP: [&amp;lt;ffffffff81056d24&amp;gt;] update_curr+0x144/0x1f0
2014-04-29 11:58:54 PGD e38c58067 PUD 0
2014-04-29 11:58:54 Thread overran stack, or stack corrupted
2014-04-29 11:58:54 Oops: 0000 [#1] SMP
2014-04-29 11:58:54 last sysfs file: /sys/devices/pci0000:00/0000:00:01.1/0000:02:00.3/net/eth1/statistics/tx_window_
errors
2014-04-29 11:58:54 CPU 2 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I&apos;ll show the backtrace from crash rather than the console, because it is much cleaner:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;PID: 130134  TASK: ffff880860051500  CPU: 2   COMMAND: &quot;java&quot;
 #0 [ffff8800446439a0] machine_kexec at ffffffff81035fcb
 #1 [ffff880044643a00] crash_kexec at ffffffff810c10b2
 #2 [ffff880044643ad0] oops_end at ffffffff815145a0
 #3 [ffff880044643b00] no_context at ffffffff81046c1b
 #4 [ffff880044643b50] __bad_area_nosemaphore at ffffffff81046ea5
 #5 [ffff880044643ba0] bad_area_nosemaphore at ffffffff81046f73
 #6 [ffff880044643bb0] __do_page_fault at ffffffff810476e1
 #7 [ffff880044643cd0] do_page_fault at ffffffff815164ee
 #8 [ffff880044643d00] page_fault at ffffffff815138a5
    [exception RIP: update_curr+324]
    RIP: ffffffff81056d24  RSP: ffff880044643db8  RFLAGS: 00010086
    RAX: ffff880860051500  RBX: 000000007feb8e00  RCX: ffff880877f101c0
    RDX: 00000000000192d8  RSI: 0000000000000000  RDI: ffff880860051538
    RBP: ffff880044643de8   R8: ffffffff8160bb45   R9: 0000000000000024
    R10: 0000000000000010  R11: 0000000000000024  R12: ffff880044656768
    R13: 0000000002275cdc  R14: 0013346892d953ec  R15: ffff880860051500
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
 #9 [ffff880044643df0] task_tick_fair at ffffffff810572db
#10 [ffff880044643e20] scheduler_tick at ffffffff8105b121
#11 [ffff880044643e60] update_process_times at ffffffff810819fe
#12 [ffff880044643e90] tick_sched_timer at ffffffff810a80c6
#13 [ffff880044643ec0] __run_hrtimer at ffffffff8109bc7e
#14 [ffff880044643f10] hrtimer_interrupt at ffffffff8109bfe6
#15 [ffff880044643f90] smp_apic_timer_interrupt at ffffffff8151a14b
#16 [ffff880044643fb0] apic_timer_interrupt at ffffffff8100bbd3
--- &amp;lt;IRQ stack&amp;gt; ---
#17 [ffff880148cb6078] apic_timer_interrupt at ffffffff8100bbd3
    [exception RIP: vprintk+593]
    RIP: ffffffff8106f491  RSP: ffff880148cb6128  RFLAGS: 00000246
    RAX: 0000000000011480  RBX: ffff880148cb61b8  RCX: 000000000000902c
    RDX: ffff880044640000  RSI: 0000000000000046  RDI: 0000000000000246
    RBP: ffffffff8100bbce   R8: 0000000000000000   R9: ffffffff8163fde0
    R10: 0000000000000001  R11: 0000000000000000  R12: ffffffff8106e6d5
    R13: ffff880148cb60b8  R14: 0000000000000046  R15: 0000000000000069
    ORIG_RAX: ffffffffffffff10  CS: 0010  SS: 0018
#18 [ffff880148cb61c0] printk at ffffffff815103f5
#19 [ffff880148cb6220] dump_stack at ffffffff81510231
#20 [ffff880148cb6260] __might_sleep at ffffffff8105695a
#21 [ffff880148cb6270] kmem_cache_alloc_node at ffffffff81167e23
#22 [ffff880148cb62c0] cache_grow at ffffffff81168197
#23 [ffff880148cb6330] cache_alloc_refill at ffffffff81168442
#24 [ffff880148cb63a0] __kmalloc at ffffffff81168af9
#25 [ffff880148cb63f0] cfs_alloc at ffffffffa0437be0 [libcfs]
#26 [ffff880148cb6420] lu_context_init at ffffffffa05cf347 [obdclass]
#27 [ffff880148cb6450] lu_env_init at ffffffffa05cf57e [obdclass]
#28 [ffff880148cb6470] cl_env_new at ffffffffa05d670d [obdclass]
#29 [ffff880148cb64c0] cl_env_get at ffffffffa05d7095 [obdclass]
#30 [ffff880148cb64f0] cl_env_nested_get at ffffffffa05d77cd [obdclass]
#31 [ffff880148cb6510] ll_releasepage at ffffffffa0aaf27c [lustre]
#32 [ffff880148cb6560] try_to_release_page at ffffffff811195a0
#33 [ffff880148cb6570] shrink_page_list.clone.3 at ffffffff81132ef7
#34 [ffff880148cb66c0] shrink_inactive_list at ffffffff81133a23
#35 [ffff880148cb6870] shrink_mem_cgroup_zone at ffffffff81134286
#36 [ffff880148cb6920] shrink_zone at ffffffff8113448a
#37 [ffff880148cb69a0] zone_reclaim at ffffffff8113515c
#38 [ffff880148cb6a70] get_page_from_freelist at ffffffff8112ac3c
#39 [ffff880148cb6b90] __alloc_pages_nodemask at ffffffff8112c1e3
#40 [ffff880148cb6cd0] kmem_getpages at ffffffff81167832
#41 [ffff880148cb6d00] cache_grow at ffffffff811681f1
#42 [ffff880148cb6d70] cache_alloc_refill at ffffffff81168442
#43 [ffff880148cb6de0] __kmalloc at ffffffff81168af9
#44 [ffff880148cb6e30] cfs_alloc at ffffffffa0437be0 [libcfs]
#45 [ffff880148cb6e60] null_alloc_reqbuf at ffffffffa07955c0 [ptlrpc]
#46 [ffff880148cb6e90] sptlrpc_cli_alloc_reqbuf at ffffffffa0785129 [ptlrpc]
#47 [ffff880148cb6ec0] lustre_pack_request at ffffffffa07590a1 [ptlrpc]
#48 [ffff880148cb6f20] __ptlrpc_request_bufs_pack at ffffffffa0745525 [ptlrpc]
#49 [ffff880148cb6f80] ptlrpc_request_bufs_pack at ffffffffa074584c [ptlrpc]
#50 [ffff880148cb6fd0] ptlrpc_request_pack at ffffffffa0745894 [ptlrpc]
#51 [ffff880148cb7000] osc_brw_prep_request at ffffffffa094a7a5 [osc]
#52 [ffff880148cb70e0] osc_build_rpc at ffffffffa094e36f [osc]
#53 [ffff880148cb71e0] osc_io_unplug0 at ffffffffa09689ed [osc]
#54 [ffff880148cb7390] osc_io_unplug at ffffffffa096b3b1 [osc]
#55 [ffff880148cb73a0] osc_queue_sync_pages at ffffffffa096b590 [osc]
#56 [ffff880148cb7410] osc_io_submit at ffffffffa095d2bf [osc]
#57 [ffff880148cb74b0] cl_io_submit_rw at ffffffffa05e43bc [obdclass]
#58 [ffff880148cb7500] lov_io_submit at ffffffffa09f2151 [lov]
#59 [ffff880148cb7590] cl_io_submit_rw at ffffffffa05e43bc [obdclass]
#60 [ffff880148cb75e0] cl_io_read_page at ffffffffa05e69de [obdclass]
#61 [ffff880148cb7630] ll_readpage at ffffffffa0a96bb6 [lustre]
#62 [ffff880148cb7670] filemap_fault at ffffffff8111b773
#63 [ffff880148cb76e0] vvp_io_fault_start at ffffffffa0ac47ec [lustre]
#64 [ffff880148cb7760] cl_io_start at ffffffffa05e451a [obdclass]
#65 [ffff880148cb7790] cl_io_loop at ffffffffa05e8c54 [obdclass]
#66 [ffff880148cb77c0] ll_fault at ffffffffa0aa6632 [lustre]
#67 [ffff880148cb7860] __do_fault at ffffffff811439f4
#68 [ffff880148cb78f0] handle_pte_fault at ffffffff81143fc7
#69 [ffff880148cb79d0] handle_mm_fault at ffffffff81144c6a
#70 [ffff880148cb7a40] __do_page_fault at ffffffff810474fc
#71 [ffff880148cb7b60] do_page_fault at ffffffff815164ee
#72 [ffff880148cb7b90] page_fault at ffffffff815138a5
    [exception RIP: copy_user_generic_string+45]
    RIP: ffffffff812844ed  RSP: ffff880148cb7c40  RFLAGS: 00010246
    RAX: ffff880148cb6000  RBX: ffff88102a7110c0  RCX: 00000000000001ff
    RDX: 0000000000000000  RSI: 00002ab74ff5cffc  RDI: ffff880859847008
    RBP: ffff880148cb7ce8   R8: 0000000000000000   R9: 0000000000000003
    R10: 0000000000000000  R11: ffffea001d394f88  R12: 0000000000001000
    R13: ffff8803d8920540  R14: 000000000000d00c  R15: 000000000000ffbc
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
#73 [ffff880148cb7c40] tcp_sendmsg at ffffffff8148f7c5
#74 [ffff880148cb7cf0] sock_aio_write at ffffffff81439deb
#75 [ffff880148cb7dc0] do_sync_write at ffffffff81181d9a
#76 [ffff880148cb7ef0] vfs_write at ffffffff81182164
#77 [ffff880148cb7f30] sys_write at ffffffff81182901
#78 [ffff880148cb7f80] system_call_fastpath at ffffffff8100b0b2
    RIP: 00002aaaaacdb4ed  RSP: 00002ab74fb2f590  RFLAGS: 00000246
    RAX: 0000000000000001  RBX: ffffffff8100b0b2  RCX: 00002ab4b37817c8
    RDX: 0000000000010000  RSI: 00002ab74ff5a000  RDI: 0000000000000069
    RBP: 0000000000010000   R8: 00002aaaac0228e0   R9: 00002ab4b698a470
    R10: 00000000000002ae  R11: 0000000000000293  R12: 00002ab4b698a460
    R13: 00002ab8300029d0  R14: 00002ab74ff5a000  R15: 00002ab8300029d0
    ORIG_RAX: 0000000000000001  CS: 0033  SS: 002b
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;It looks like the code is doing mmap IO.  Note that we are reentering cfs_alloc under the shrinker that was called to handle the first cfs_alloc.&lt;/p&gt;</description>
                <environment></environment>
        <key id="24499">LU-4982</key>
            <summary>Stack overrun in client under ll_fault</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="3">Duplicate</resolution>
                                        <assignee username="bobijam">Zhenyu Xu</assignee>
                                    <reporter username="morrone">Christopher Morrone</reporter>
                        <labels>
                            <label>llnl</label>
                    </labels>
                <created>Wed, 30 Apr 2014 00:14:04 +0000</created>
                <updated>Tue, 26 Aug 2014 22:54:20 +0000</updated>
                            <resolved>Tue, 26 Aug 2014 22:54:13 +0000</resolved>
                                                                        <due></due>
                            <votes>0</votes>
                                    <watches>7</watches>
                                                                            <comments>
                            <comment id="82834" author="bfaccini" created="Wed, 30 Apr 2014 09:14:09 +0000"  >&lt;p&gt;Looks very similar to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3188&quot; title=&quot;IOR fails due to client stack overrun&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3188&quot;&gt;&lt;del&gt;LU-3188&lt;/del&gt;&lt;/a&gt;, but its patch having osc_queue_sync_pages() call osc_io_unplug_async() instead of osc_io_unplug(), to shorten the IO calling path and thus to reduce the chance of stack overflow, has been reverted because suspected to cause side effects.&lt;br/&gt;
I wonder why &quot;recent&quot; Kernels still use only 2 pages stacks, I thought it had been at least doubled and since a long time ...&lt;/p&gt;</comment>
                            <comment id="82845" author="pjones" created="Wed, 30 Apr 2014 13:23:50 +0000"  >&lt;p&gt;Bobijam&lt;/p&gt;

&lt;p&gt;Could you please comment on this one?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="82961" author="jay" created="Thu, 1 May 2014 00:46:34 +0000"  >&lt;p&gt;It&apos;s probably good to revive the patch in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3188&quot; title=&quot;IOR fails due to client stack overrun&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3188&quot;&gt;&lt;del&gt;LU-3188&lt;/del&gt;&lt;/a&gt; since we have patch in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4509&quot; title=&quot;clio can be stuck in osc_extent_wait&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4509&quot;&gt;&lt;del&gt;LU-4509&lt;/del&gt;&lt;/a&gt; landed.&lt;/p&gt;</comment>
                            <comment id="83840" author="bobijam" created="Mon, 12 May 2014 14:17:43 +0000"  >&lt;p&gt;revive patch &lt;a href=&quot;http://review.whamcloud.com/10292&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/10292&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="87321" author="pjones" created="Mon, 23 Jun 2014 21:37:47 +0000"  >&lt;p&gt;The patch from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3188&quot; title=&quot;IOR fails due to client stack overrun&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3188&quot;&gt;&lt;del&gt;LU-3188&lt;/del&gt;&lt;/a&gt; has landed to master. LLNL will try it out against their reproducer to see whether this is sufficient.&lt;/p&gt;</comment>
                            <comment id="92550" author="morrone" created="Tue, 26 Aug 2014 22:47:48 +0000"  >&lt;p&gt;We ran the reproducer &quot;a whole bunch of times, didn&apos;t see a crash&quot;.  I believe that this is fixed.  We&apos;ll want it re-landed to b2_5 as well.&lt;/p&gt;</comment>
                            <comment id="92552" author="pjones" created="Tue, 26 Aug 2014 22:54:13 +0000"  >&lt;p&gt;Great! So it is a duplicate of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3188&quot; title=&quot;IOR fails due to client stack overrun&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3188&quot;&gt;&lt;del&gt;LU-3188&lt;/del&gt;&lt;/a&gt; which is fixed in 2.6 and we should consider for inclusion in a 2.5.x maintenance release&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzwlgv:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>13802</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>