Details
-
Bug
-
Resolution: Unresolved
-
Critical
-
None
-
Lustre 2.15.0
-
None
-
4.18.0-348.2.1.el8_lustre.x86_64 redhat8.5
-
2
-
9223372036854775807
Description
Repeated kernel crashes (BUG: unable to handle kernel paging request), both with and without ZFS. I have multiple crash dumps that I can upload.
Here are some of the stack traces:
127.0.0.1-2022-07-12-12\:07\:42/vmcore crash> bt 21199 PID: 21199 TASK: ffff90997c843000 CPU: 28 COMMAND: "fio" #0 [ffff9fe0e357fb88] machine_kexec at fffffffface641ce #1 [ffff9fe0e357fbe0] __crash_kexec at ffffffffacf9df1d #2 [ffff9fe0e357fca8] crash_kexec at ffffffffacf9ee0d #3 [ffff9fe0e357fcc0] oops_end at fffffffface2613d #4 [ffff9fe0e357fce0] no_context at fffffffface7562f #5 [ffff9fe0e357fd38] __bad_area_nosemaphore at fffffffface7598c #6 [ffff9fe0e357fd80] do_page_fault at fffffffface76267 #7 [ffff9fe0e357fdb0] page_fault at ffffffffad80111e [exception RIP: __kmalloc_node+418] RIP: ffffffffad0fd382 RSP: ffff9fe0e357fe68 RFLAGS: 00010246 RAX: ffff909c032f3e78 RBX: ffff9fe0e357fed8 RCX: 0000000000000000 RDX: 0000000000623f36 RSI: 00000000006000c0 RDI: 000000000002f040 RBP: 00000000006000c0 R8: ffff909bbd32f040 R9: ffff908ff2efae40 R10: ffff905c80004400 R11: ffffffffae45a410 R12: 0000000000000008 R13: 00000000ffffffff R14: ffff905c80004400 R15: ffffffffad75537b ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #8 [ffff9fe0e357feb8] alloc_cpumask_var_node at ffffffffad75537b #9 [ffff9fe0e357fec8] sched_setaffinity at ffffffffacf1eb4d #10 [ffff9fe0e357ff08] __x64_sys_sched_setaffinity at ffffffffacf1edef #11 [ffff9fe0e357ff38] do_syscall_64 at fffffffface042bb #12 [ffff9fe0e357ff50] entry_SYSCALL_64_after_hwframe at ffffffffad8000ad RIP: 00007ff5a7c8e81d RSP: 00007ff546cd46f8 RFLAGS: 00000203 RAX: ffffffffffffffda RBX: 000055a5798b4170 RCX: 00007ff5a7c8e81d RDX: 00007ff546cd4700 RSI: 0000000000000080 RDI: 00000000000052cf RBP: 00007ff546cd4700 R8: 00007ff546cd7700 R9: 0000000000000100 R10: 00007ff546cd7700 R11: 0000000000000203 R12: 00007ffcf7d4db5e R13: 00007ffcf7d4db5f R14: 0000000000000000 R15: 00007ff546cd4880 ORIG_RAX: 00000000000000cb CS: 0033 SS: 002b 127.0.0.1-2022-07-12-23\:43\:02/vmcore crash> bt 2 PID: 2 TASK: ffff9012c36b4800 CPU: 18 COMMAND: "kthreadd" #0 [ffff9e5d0c4bb708] machine_kexec at ffffffff9e8641ce #1 [ffff9e5d0c4bb760] __crash_kexec at 
ffffffff9e99df1d #2 [ffff9e5d0c4bb828] crash_kexec at ffffffff9e99ee0d #3 [ffff9e5d0c4bb840] oops_end at ffffffff9e82613d #4 [ffff9e5d0c4bb860] no_context at ffffffff9e87562f #5 [ffff9e5d0c4bb8b8] __bad_area_nosemaphore at ffffffff9e87598c #6 [ffff9e5d0c4bb900] do_page_fault at ffffffff9e876267 #7 [ffff9e5d0c4bb930] page_fault at ffffffff9f20111e [exception RIP: deactivate_slab+158] RIP: ffffffff9eafad0e RSP: ffff9e5d0c4bb9e0 RFLAGS: 00010086 RAX: 0000000000000010 RBX: ffff904244a66908 RCX: 00000000000000cc RDX: ffff904244a66248 RSI: ffff905244a66e00 RDI: ffff904244a66000 RBP: ffff9e5d0c4bbab0 R8: 0000000000000001 R9: 00000000000000cc R10: 0000000000000000 R11: 0000000000000000 R12: ffffd01702129980 R13: ffff9021fffaf040 R14: ffff905244a66e00 R15: ffff9012c0004400 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #8 [ffff9e5d0c4bbab8] ___slab_alloc at ffffffff9eafbd31 #9 [ffff9e5d0c4bbaf0] memcg_alloc_page_obj_cgroups at ffffffff9eb18dea #10 [ffff9e5d0c4bbb30] perf_output_end at ffffffff9ea6bf9a #11 [ffff9e5d0c4bbb38] perf_event_task_output at ffffffff9ea5b92c #12 [ffff9e5d0c4bbcf8] copy_process at ffffffff9e8e96b9 #13 [ffff9e5d0c4bbd70] copy_process at ffffffff9e8e96b9 #14 [ffff9e5d0c4bbdd0] __switch_to_asm at ffffffff9f2001e5 #15 [ffff9e5d0c4bbe30] _do_fork at ffffffff9e8eb09f #16 [ffff9e5d0c4bbea8] kernel_thread at ffffffff9e8eb415 #17 [ffff9e5d0c4bbeb0] kthreadd at ffffffff9e910c04 #18 [ffff9e5d0c4bbf50] ret_from_fork at ffffffff9f200242 127.0.0.1-2022-07-12-23:31:06/vmcore crash> bt 2 PID: 2 TASK: ffff8ce78317c800 CPU: 18 COMMAND: "kthreadd" #0 [ffff9a144c4bb708] machine_kexec at ffffffff8fa641ce #1 [ffff9a144c4bb760] __crash_kexec at ffffffff8fb9df1d #2 [ffff9a144c4bb828] crash_kexec at ffffffff8fb9ee0d #3 [ffff9a144c4bb840] oops_end at ffffffff8fa2613d #4 [ffff9a144c4bb860] no_context at ffffffff8fa7562f #5 [ffff9a144c4bb8b8] __bad_area_nosemaphore at ffffffff8fa7598c #6 [ffff9a144c4bb900] do_page_fault at ffffffff8fa76267 #7 [ffff9a144c4bb930] page_fault at 
ffffffff9040111e [exception RIP: deactivate_slab+158] RIP: ffffffff8fcfad0e RSP: ffff9a144c4bb9e0 RFLAGS: 00010082 RAX: 0000000000000010 RBX: ffff8ce78436eff8 RCX: 00000000000000ae RDX: ffff8ce78436e248 RSI: ffff8cf78436e038 RDI: ffff8ce78436e000 RBP: ffff9a144c4bbab0 R8: 0000000000000001 R9: 00000000000000ae R10: 0000000000000000 R11: 0000000000000000 R12: ffffee500210db80 R13: ffff8cc73ffaf040 R14: ffff8cf78436e038 R15: ffff8cb800004400 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #8 [ffff9a144c4bbab8] ___slab_alloc at ffffffff8fcfbd31 #9 [ffff9a144c4bbaf0] memcg_alloc_page_obj_cgroups at ffffffff8fd18dea #10 [ffff9a144c4bbb30] perf_output_end at ffffffff8fc6bf9a #11 [ffff9a144c4bbb38] perf_event_task_output at ffffffff8fc5b92c #12 [ffff9a144c4bbcf8] copy_process at ffffffff8fae96b9 #13 [ffff9a144c4bbd70] copy_process at ffffffff8fae96b9 #14 [ffff9a144c4bbdd0] __switch_to_asm at ffffffff904001e5 #15 [ffff9a144c4bbe30] _do_fork at ffffffff8faeb09f #16 [ffff9a144c4bbea8] kernel_thread at ffffffff8faeb415 #17 [ffff9a144c4bbeb0] kthreadd at ffffffff8fb10c04 #18 [ffff9a144c4bbf50] ret_from_fork at ffffffff90400242 127.0.0.1-2022-07-13-10_12_31/vmcore crash> bt 106937 PID: 106937 TASK: ffff9b71c54c9800 CPU: 27 COMMAND: "ll_ost_io02_016" #0 [ffffad02a1ba7488] machine_kexec at ffffffff93c641ce #1 [ffffad02a1ba74e0] __crash_kexec at ffffffff93d9df1d #2 [ffffad02a1ba75a8] crash_kexec at ffffffff93d9ee0d #3 [ffffad02a1ba75c0] oops_end at ffffffff93c2613d #4 [ffffad02a1ba75e0] no_context at ffffffff93c7562f #5 [ffffad02a1ba7638] __bad_area_nosemaphore at ffffffff93c7598c #6 [ffffad02a1ba7680] do_page_fault at ffffffff93c76267 #7 [ffffad02a1ba76b0] page_fault at ffffffff9460111e [exception RIP: __kmalloc_node+418] RIP: ffffffff93efd382 RSP: ffffad02a1ba7768 RFLAGS: 00010246 RAX: 0000001000000000 RBX: 000000000060c2c0 RCX: 0000000000000000 RDX: 000000000001eca3 RSI: 000000000060c2c0 RDI: 000000000002f040 RBP: 000000000060c2c0 R8: ffff9b917fdef040 R9: ffffad02a1ba786c R10: 
ffff9b6240004400 R11: 0000000000000000 R12: 0000000000000008 R13: 00000000ffffffff R14: ffff9b6240004400 R15: ffffffffc01f2563 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #8 [ffffad02a1ba77b8] spl_kmem_zalloc at ffffffffc01f2563 [spl] #9 [ffffad02a1ba77e8] dmu_buf_hold_array_by_dnode at ffffffffc03e9bbd [zfs] #10 [ffffad02a1ba7850] dmu_write_by_dnode at ffffffffc03ea2ca [zfs] #11 [ffffad02a1ba78a0] osd_write at ffffffffc255ebe8 [osd_zfs] #12 [ffffad02a1ba78e0] dt_record_write at ffffffffc1b684b2 [obdclass] #13 [ffffad02a1ba78f0] tgt_server_data_write at ffffffffc193e460 [ptlrpc] #14 [ffffad02a1ba7920] tgt_txn_stop_cb at ffffffffc19469c0 [ptlrpc] #15 [ffffad02a1ba7988] dt_txn_hook_stop at ffffffffc1b6b563 [obdclass] #16 [ffffad02a1ba79b0] osd_trans_stop at ffffffffc25509f6 [osd_zfs] #17 [ffffad02a1ba7a00] ofd_commitrw_write at ffffffffc203a4d3 [ofd] #18 [ffffad02a1ba7aa0] ofd_commitrw at ffffffffc203f831 [ofd] #19 [ffffad02a1ba7b60] obd_commitrw at ffffffffc194b47c [ptlrpc] #20 [ffffad02a1ba7bd0] tgt_brw_write at ffffffffc1953a80 [ptlrpc] #21 [ffffad02a1ba7d50] tgt_request_handle at ffffffffc1955053 [ptlrpc] #22 [ffffad02a1ba7dd0] ptlrpc_server_handle_request at ffffffffc1901983 [ptlrpc] #23 [ffffad02a1ba7e38] ptlrpc_main at ffffffffc1903486 [ptlrpc] #24 [ffffad02a1ba7f10] kthread at ffffffff93d0f726 #25 [ffffad02a1ba7f50] ret_from_fork at ffffffff94600242
Attachments
Issue Links
- is related to
-
LU-15308 lod_alloc_comp_entries should set ldo_mirrors to NULL on error
-
- Resolved
-