Details
-
Bug
-
Resolution: Duplicate
-
Major
-
None
-
None
-
3
-
13802
Description
Running a lustre client that is essentially 2.4.0-25chaos (see github.com/chaos/lustre) plus my stathead patch from LU-4624, a user hit the following kernel Oops:
2014-04-29 11:58:54 BUG: sleeping function called from invalid context at mm/slab.c:3113 2014-04-29 11:58:54 in_atomic(): 0, irqs_disabled(): 0, pid: 130134, name: java 2014-04-29 11:58:54 Pid: 130134, comm: java Tainted: G W --------------- 2.6.32-358.14.1.3chaos.ch5.1.1.x8 6_64 #1 2014-04-29 11:58:54 BUG: unable to handle kernel paging request at 00000003811c4f40 2014-04-29 11:58:54 IP: [<ffffffff81056d24>] update_curr+0x144/0x1f0 2014-04-29 11:58:54 PGD e38c58067 PUD 0 2014-04-29 11:58:54 Thread overran stack, or stack corrupted 2014-04-29 11:58:54 Oops: 0000 [#1] SMP 2014-04-29 11:58:54 last sysfs file: /sys/devices/pci0000:00/0000:00:01.1/0000:02:00.3/net/eth1/statistics/tx_window_ errors 2014-04-29 11:58:54 CPU 2
I'll show the backtrace from crash rather than the console, because it is much cleaner:
PID: 130134 TASK: ffff880860051500 CPU: 2 COMMAND: "java" #0 [ffff8800446439a0] machine_kexec at ffffffff81035fcb #1 [ffff880044643a00] crash_kexec at ffffffff810c10b2 #2 [ffff880044643ad0] oops_end at ffffffff815145a0 #3 [ffff880044643b00] no_context at ffffffff81046c1b #4 [ffff880044643b50] __bad_area_nosemaphore at ffffffff81046ea5 #5 [ffff880044643ba0] bad_area_nosemaphore at ffffffff81046f73 #6 [ffff880044643bb0] __do_page_fault at ffffffff810476e1 #7 [ffff880044643cd0] do_page_fault at ffffffff815164ee #8 [ffff880044643d00] page_fault at ffffffff815138a5 [exception RIP: update_curr+324] RIP: ffffffff81056d24 RSP: ffff880044643db8 RFLAGS: 00010086 RAX: ffff880860051500 RBX: 000000007feb8e00 RCX: ffff880877f101c0 RDX: 00000000000192d8 RSI: 0000000000000000 RDI: ffff880860051538 RBP: ffff880044643de8 R8: ffffffff8160bb45 R9: 0000000000000024 R10: 0000000000000010 R11: 0000000000000024 R12: ffff880044656768 R13: 0000000002275cdc R14: 0013346892d953ec R15: ffff880860051500 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #9 [ffff880044643df0] task_tick_fair at ffffffff810572db #10 [ffff880044643e20] scheduler_tick at ffffffff8105b121 #11 [ffff880044643e60] update_process_times at ffffffff810819fe #12 [ffff880044643e90] tick_sched_timer at ffffffff810a80c6 #13 [ffff880044643ec0] __run_hrtimer at ffffffff8109bc7e #14 [ffff880044643f10] hrtimer_interrupt at ffffffff8109bfe6 #15 [ffff880044643f90] smp_apic_timer_interrupt at ffffffff8151a14b #16 [ffff880044643fb0] apic_timer_interrupt at ffffffff8100bbd3 --- <IRQ stack> --- #17 [ffff880148cb6078] apic_timer_interrupt at ffffffff8100bbd3 [exception RIP: vprintk+593] RIP: ffffffff8106f491 RSP: ffff880148cb6128 RFLAGS: 00000246 RAX: 0000000000011480 RBX: ffff880148cb61b8 RCX: 000000000000902c RDX: ffff880044640000 RSI: 0000000000000046 RDI: 0000000000000246 RBP: ffffffff8100bbce R8: 0000000000000000 R9: ffffffff8163fde0 R10: 0000000000000001 R11: 0000000000000000 R12: ffffffff8106e6d5 R13: ffff880148cb60b8 R14: 0000000000000046 R15: 0000000000000069 ORIG_RAX: ffffffffffffff10 CS: 0010 SS: 0018 #18 [ffff880148cb61c0] printk at ffffffff815103f5 #19 [ffff880148cb6220] dump_stack at ffffffff81510231 #20 [ffff880148cb6260] __might_sleep at ffffffff8105695a #21 [ffff880148cb6270] kmem_cache_alloc_node at ffffffff81167e23 #22 [ffff880148cb62c0] cache_grow at ffffffff81168197 #23 [ffff880148cb6330] cache_alloc_refill at ffffffff81168442 #24 [ffff880148cb63a0] __kmalloc at ffffffff81168af9 #25 [ffff880148cb63f0] cfs_alloc at ffffffffa0437be0 [libcfs] #26 [ffff880148cb6420] lu_context_init at ffffffffa05cf347 [obdclass] #27 [ffff880148cb6450] lu_env_init at ffffffffa05cf57e [obdclass] #28 [ffff880148cb6470] cl_env_new at ffffffffa05d670d [obdclass] #29 [ffff880148cb64c0] cl_env_get at ffffffffa05d7095 [obdclass] #30 [ffff880148cb64f0] cl_env_nested_get at ffffffffa05d77cd [obdclass] #31 [ffff880148cb6510] ll_releasepage at ffffffffa0aaf27c [lustre] #32 [ffff880148cb6560] try_to_release_page at ffffffff811195a0 #33 [ffff880148cb6570] shrink_page_list.clone.3 at ffffffff81132ef7 #34 [ffff880148cb66c0] shrink_inactive_list at ffffffff81133a23 #35 [ffff880148cb6870] shrink_mem_cgroup_zone at ffffffff81134286 #36 [ffff880148cb6920] shrink_zone at ffffffff8113448a #37 [ffff880148cb69a0] zone_reclaim at ffffffff8113515c #38 [ffff880148cb6a70] get_page_from_freelist at ffffffff8112ac3c #39 [ffff880148cb6b90] __alloc_pages_nodemask at ffffffff8112c1e3 #40 [ffff880148cb6cd0] kmem_getpages at ffffffff81167832 #41 [ffff880148cb6d00] cache_grow at ffffffff811681f1 #42 [ffff880148cb6d70] cache_alloc_refill at ffffffff81168442 #43 [ffff880148cb6de0] __kmalloc at ffffffff81168af9 #44 [ffff880148cb6e30] cfs_alloc at ffffffffa0437be0 [libcfs] #45 [ffff880148cb6e60] null_alloc_reqbuf at ffffffffa07955c0 [ptlrpc] #46 [ffff880148cb6e90] sptlrpc_cli_alloc_reqbuf at ffffffffa0785129 [ptlrpc] #47 [ffff880148cb6ec0] lustre_pack_request at ffffffffa07590a1 [ptlrpc] #48 [ffff880148cb6f20] __ptlrpc_request_bufs_pack at ffffffffa0745525 [ptlrpc] #49 [ffff880148cb6f80] ptlrpc_request_bufs_pack at ffffffffa074584c [ptlrpc] #50 [ffff880148cb6fd0] ptlrpc_request_pack at ffffffffa0745894 [ptlrpc] #51 [ffff880148cb7000] osc_brw_prep_request at ffffffffa094a7a5 [osc] #52 [ffff880148cb70e0] osc_build_rpc at ffffffffa094e36f [osc] #53 [ffff880148cb71e0] osc_io_unplug0 at ffffffffa09689ed [osc] #54 [ffff880148cb7390] osc_io_unplug at ffffffffa096b3b1 [osc] #55 [ffff880148cb73a0] osc_queue_sync_pages at ffffffffa096b590 [osc] #56 [ffff880148cb7410] osc_io_submit at ffffffffa095d2bf [osc] #57 [ffff880148cb74b0] cl_io_submit_rw at ffffffffa05e43bc [obdclass] #58 [ffff880148cb7500] lov_io_submit at ffffffffa09f2151 [lov] #59 [ffff880148cb7590] cl_io_submit_rw at ffffffffa05e43bc [obdclass] #60 [ffff880148cb75e0] cl_io_read_page at ffffffffa05e69de [obdclass] #61 [ffff880148cb7630] ll_readpage at ffffffffa0a96bb6 [lustre] #62 [ffff880148cb7670] filemap_fault at ffffffff8111b773 #63 [ffff880148cb76e0] vvp_io_fault_start at ffffffffa0ac47ec [lustre] #64 [ffff880148cb7760] cl_io_start at ffffffffa05e451a [obdclass] #65 [ffff880148cb7790] cl_io_loop at ffffffffa05e8c54 [obdclass] #66 [ffff880148cb77c0] ll_fault at ffffffffa0aa6632 [lustre] #67 [ffff880148cb7860] __do_fault at ffffffff811439f4 #68 [ffff880148cb78f0] handle_pte_fault at ffffffff81143fc7 #69 [ffff880148cb79d0] handle_mm_fault at ffffffff81144c6a #70 [ffff880148cb7a40] __do_page_fault at ffffffff810474fc #71 [ffff880148cb7b60] do_page_fault at ffffffff815164ee #72 [ffff880148cb7b90] page_fault at ffffffff815138a5 [exception RIP: copy_user_generic_string+45] RIP: ffffffff812844ed RSP: ffff880148cb7c40 RFLAGS: 00010246 RAX: ffff880148cb6000 RBX: ffff88102a7110c0 RCX: 00000000000001ff RDX: 0000000000000000 RSI: 00002ab74ff5cffc RDI: ffff880859847008 RBP: ffff880148cb7ce8 R8: 0000000000000000 R9: 0000000000000003 R10: 0000000000000000 R11: ffffea001d394f88 R12: 0000000000001000 R13: ffff8803d8920540 R14: 000000000000d00c R15: 000000000000ffbc ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #73 [ffff880148cb7c40] tcp_sendmsg at ffffffff8148f7c5 #74 [ffff880148cb7cf0] sock_aio_write at ffffffff81439deb #75 [ffff880148cb7dc0] do_sync_write at ffffffff81181d9a #76 [ffff880148cb7ef0] vfs_write at ffffffff81182164 #77 [ffff880148cb7f30] sys_write at ffffffff81182901 #78 [ffff880148cb7f80] system_call_fastpath at ffffffff8100b0b2 RIP: 00002aaaaacdb4ed RSP: 00002ab74fb2f590 RFLAGS: 00000246 RAX: 0000000000000001 RBX: ffffffff8100b0b2 RCX: 00002ab4b37817c8 RDX: 0000000000010000 RSI: 00002ab74ff5a000 RDI: 0000000000000069 RBP: 0000000000010000 R8: 00002aaaac0228e0 R9: 00002ab4b698a470 R10: 00000000000002ae R11: 0000000000000293 R12: 00002ab4b698a460 R13: 00002ab8300029d0 R14: 00002ab74ff5a000 R15: 00002ab8300029d0 ORIG_RAX: 0000000000000001 CS: 0033 SS: 002b
It looks like the code is doing mmap IO. Note that we are reentering cfs_alloc under the shrinker that was called to handle the first cfs_alloc.