Details
Description
soak-7 (OSS) OOM after it stuck in orphan delete
full cordump is on spirit:
[root@soak-16 10.10.1.107-2017-12-14-12:40:00]# pwd
/scratch/dumps/soak-7.spirit.hpdd.intel.com/10.10.1.107-2017-12-14-12:40:00
[44337.534689] LustreError: dumping log to /tmp/lustre-log.1513254458.29742 [44337.745264] LNet: Service thread pid 9119 was inactive for 1205.40s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [44337.745265] Pid: 9119, comm: ll_ost01_014 [44337.745265] Call Trace: [44337.745267] [<ffffffff816a9569>] schedule+0x29/0x70 [44337.745280] [<ffffffffc07974d5>] cv_wait_common+0x125/0x150 [spl] [44337.745281] [<ffffffff810b1920>] ? autoremove_wake_function+0x0/0x40 [44337.745285] [<ffffffffc0797515>] __cv_wait+0x15/0x20 [spl] [44337.745327] [<ffffffffc1fd8a33>] txg_wait_open+0xc3/0x110 [zfs] [44337.745344] [<ffffffffc1f8dd9a>] dmu_tx_wait+0x3aa/0x3c0 [zfs] [44337.745359] [<ffffffffc1f8de41>] dmu_tx_assign+0x91/0x490 [zfs] [44337.745365] [<ffffffffc0fbdf3a>] osd_trans_start+0xaa/0x3c0 [osd_zfs] [44337.745373] [<ffffffffc10eb198>] ofd_destroy+0x248/0x940 [ofd] [44337.745377] [<ffffffffc10e3b43>] ofd_destroy_by_fid+0x1e3/0x4a0 [ofd] [44337.745402] [<ffffffffc0dbf200>] ? ldlm_blocking_ast+0x0/0x170 [ptlrpc] [44337.745423] [<ffffffffc0db97a0>] ? ldlm_completion_ast+0x0/0x920 [ptlrpc] [44337.745427] [<ffffffffc10d84cd>] ofd_create_hdl+0x1abd/0x2090 [ofd] [44337.745451] [<ffffffffc0deee37>] ? lustre_msg_add_version+0x27/0xa0 [ptlrpc] [44337.745480] [<ffffffffc0e520d5>] tgt_request_handle+0x925/0x1370 [ptlrpc] [44337.745504] [<ffffffffc0dfaf16>] ptlrpc_server_handle_request+0x236/0xa90 [ptlrpc] [44337.745528] [<ffffffffc0df7548>] ? ptlrpc_wait_event+0x98/0x340 [ptlrpc] [44337.745554] [<ffffffffc0c14be1>] ? keys_fini+0xb1/0x1d0 [obdclass] [44337.745578] [<ffffffffc0df98d1>] ? ptlrpc_server_drop_request+0x431/0x6c0 [ptlrpc] [44337.745602] [<ffffffffc0dfe652>] ptlrpc_main+0xa92/0x1e40 [ptlrpc] [44337.745625] [<ffffffffc0dfdbc0>] ? ptlrpc_main+0x0/0x1e40 [ptlrpc] [44337.745626] [<ffffffff810b099f>] kthread+0xcf/0xe0 [44337.745627] [<ffffffff810b08d0>] ? kthread+0x0/0xe0 [44337.745629] [<ffffffff816b4fd8>] ret_from_fork+0x58/0x90 [44337.745630] [<ffffffff810b08d0>] ? kthread+0x0/0xe0 [44337.745630] [44339.829743] LNet: Service thread pid 103074 was inactive for 1201.15s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: [44339.829744] Pid: 103074, comm: ll_ost01_048 [44339.829744] Call Trace: [44339.829748] [<ffffffff816aa489>] schedule_preempt_disabled+0x29/0x70 [44339.829750] [<ffffffff816a83b7>] __mutex_lock_slowpath+0xc7/0x1d0 [44339.829752] [<ffffffff816a77cf>] mutex_lock+0x1f/0x2f [44339.829759] [<ffffffffc10d77db>] ofd_create_hdl+0xdcb/0x2090 [ofd] [44339.829793] [<ffffffffc0deee37>] ? lustre_msg_add_version+0x27/0xa0 [ptlrpc] [44339.829818] [<ffffffffc0def18f>] ? lustre_pack_reply_v2+0x14f/0x280 [ptlrpc] [44339.829842] [<ffffffffc0def4b1>] ? lustre_pack_reply+0x11/0x20 [ptlrpc] [44339.829872] [<ffffffffc0e520d5>] tgt_request_handle+0x925/0x1370 [ptlrpc] [44339.829897] [<ffffffffc0dfaf16>] ptlrpc_server_handle_request+0x236/0xa90 [ptlrpc] [44339.829921] [<ffffffffc0df7548>] ? ptlrpc_wait_event+0x98/0x340 [ptlrpc] [44339.829923] [<ffffffff810c4832>] ? default_wake_function+0x12/0x20 [44339.829924] [<ffffffff810ba598>] ? __wake_up_common+0x58/0x90 [44339.829949] [<ffffffffc0dfe652>] ptlrpc_main+0xa92/0x1e40 [ptlrpc] [44339.829973] [<ffffffffc0dfdbc0>] ? ptlrpc_main+0x0/0x1e40 [ptlrpc] [44339.829974] [<ffffffff810b099f>] kthread+0xcf/0xe0 [44339.829975] [<ffffffff810b08d0>] ? kthread+0x0/0xe0 [44339.829977] [<ffffffff816b4fd8>] ret_from_fork+0x58/0x90 [44339.829978] [<ffffffff810b08d0>] ? kthread+0x0/0xe0 [44343.925683] LNet: Service thread pid 103097 was inactive for 1202.37s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [44343.925685] LNet: Skipped 3 previous similar messages [44343.925687] LustreError: dumping log to /tmp/lustre-log.1513254464.103097 [44348.021604] LNet: Service thread pid 72214 was inactive for 1203.49s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one. [44348.178864] LNet: Skipped 2 previous similar messages [44348.240314] LustreError: dumping log to /tmp/lustre-log.1513254468.72214 [44771.642327] LNet: 5683:0:(o2iblnd_cb.c:3192:kiblnd_check_conns()) Timed out tx for 192.168.1.114@o2ib: 0 seconds [44791.889635] Lustre: soaked-OST0005: Export ffff88074f3a3000 already connecting from 192.168.1.108@o2ib [44792.001974] Lustre: Skipped 12 previous similar messages [44929.515633] Lustre: 136835:0:(service.c:1346:ptlrpc_at_send_early_reply()) @@@ Couldn't add any time (5/5), not sending early reply req@ffff8804149d5450 x1586704105272848/t0(0) o4->7df9a58e-df80-cd01-0d09-5737d3c03bbe@192.168.1.131@o2ib:250/0 lens 608/0 e 0 to 0 dl 1513255055 ref 2 fl New:/2/ffffffff rc 0/-1 [44929.613566] Lustre: 136835:0:(service.c:1346:ptlrpc_at_send_early_reply()) Skipped 795 previous similar messages [44935.379406] Lustre: soaked-OST0005: Client soaked-MDT0003-mdtlov_UUID (at 192.168.1.111@o2ib) reconnecting [44935.412601] Lustre: Skipped 77 previous similar messages [44935.429973] Lustre: soaked-OST0005: Connection restored to 6943d9de-cea7-4a53-1661-77d616b076c6 (at 192.168.1.136@o2ib) [44935.429975] Lustre: Skipped 79 previous similar messages [44940.659687] gmain invoked oom-killer: gfp_mask=0x200da, order=0, oom_score_adj=0 [44940.685141] gmain cpuset=/ mems_allowed=0-1 [44940.699853] CPU: 10 PID: 2945 Comm: gmain Tainted: P OE ------------ 3.10.0-693.5.2.el7_lustre.x86_64 #1 [44940.735748] Hardware name: Intel Corporation S2600GZ ........../S2600GZ, BIOS SE5C600.86B.01.08.0003.022620131521 02/26/2013 [44940.773670] ffff8800b12d4f10 00000000b91ef903 ffff880411287988 ffffffff816a3e2d [44940.799046] ffff880411287a18 ffffffff8169f246 ffffffff810e93ac ffff880410fb97c0 [44940.824448] ffff880410fb97d8 0000000000000202 0000000000000202 ffff880411287a08 [44940.849853] Call Trace: [44940.858910] [<ffffffff816a3e2d>] dump_stack+0x19/0x1b [44940.876837] [<ffffffff8169f246>] dump_header+0x90/0x229 [44940.895326] [<ffffffff810e93ac>] ? ktime_get_ts64+0x4c/0xf0 [44940.914962] [<ffffffff8113d37f>] ? delayacct_end+0x8f/0xb0 [44940.934307] [<ffffffff811863a4>] oom_kill_process+0x254/0x3d0 [44940.954512] [<ffffffff81185e4d>] ? oom_unkillable_task+0xcd/0x120 [44940.975861] [<ffffffff81185ef6>] ? find_lock_task_mm+0x56/0xc0 [44940.996357] [<ffffffff81186be6>] out_of_memory+0x4b6/0x4f0 [44941.015701] [<ffffffff8169fd4a>] __alloc_pages_slowpath+0x5d6/0x724 [44941.037591] [<ffffffff8118cdb5>] __alloc_pages_nodemask+0x405/0x420 [44941.059496] [<ffffffff811d40a5>] alloc_pages_vma+0xb5/0x200 [44941.079116] [<ffffffff811c44ad>] read_swap_cache_async+0xed/0x160 [44941.100448] [<ffffffff811c45c8>] swapin_readahead+0xa8/0x110 [44941.120355] [<ffffffff811b22cb>] handle_mm_fault+0xadb/0xfa0 [44941.140262] [<ffffffff816b0074>] __do_page_fault+0x154/0x450 [44941.160165] [<ffffffff816b03a5>] do_page_fault+0x35/0x90 [44941.178931] [<ffffffff816ac5c8>] page_fault+0x28/0x30 [44941.196833] Mem-Info: [44941.205292] active_anon:35 inactive_anon:0 isolated_anon:0 active_file:0 inactive_file:80 isolated_file:0 unevictable:6965 dirty:0 writeback:11 unstable:0 slab_reclaimable:9803 slab_unreclaimable:234987 mapped:1718 shmem:0 pagetables:1777 bounce:0 free:39369 free_pcp:20 free_cma:0 [44941.317280] Node 0 DMA free:15576kB min:40kB low:48kB high:60kB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15920kB managed:15836kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:0kB slab_unreclaimable:0kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? yes [44941.457405] lowmem_reserve[]: 0 2580 15620 15620 [44941.473633] Node 0 DMA32 free:59480kB min:7320kB low:9148kB high:10980kB active_anon:4kB inactive_anon:4kB active_file:8kB inactive_file:4kB unevictable:568kB isolated(anon):0kB isolated(file):0kB present:3051628kB managed:2643828kB mlocked:568kB dirty:0kB writeback:0kB mapped:576kB shmem:0kB slab_reclaimable:2240kB slab_unreclaimable:59756kB kernel_stack:1056kB pagetables:360kB unstable:0kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:118 all_unreclaimable? yes [44941.623315] lowmem_reserve[]: 0 0 13040 13040 [44941.638694] Node 0 Normal free:36872kB min:37012kB low:46264kB high:55516kB active_anon:4kB inactive_anon:32kB active_file:0kB inactive_file:372kB unevictable:3804kB isolated(anon):0kB isolated(file):0kB present:13631488kB managed:13353036kB mlocked:3804kB dirty:0kB writeback:0kB mapped:4120kB shmem:0kB slab_reclaimable:16428kB slab_unreclaimable:312080kB kernel_stack:10832kB pagetables:3212kB unstable:0kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:973 all_unreclaimable? yes [44941.793202] lowmem_reserve[]: 0 0 0 0 [44941.806201] Node 1 Normal free:45552kB min:45728kB low:57160kB high:68592kB active_anon:140kB inactive_anon:0kB active_file:44kB inactive_file:184kB unevictable:23488kB isolated(anon):0kB isolated(file):0kB present:16777216kB managed:16498508kB mlocked:23488kB dirty:0kB writeback:44kB mapped:2180kB shmem:0kB slab_reclaimable:20544kB slab_unreclaimable:568112kB kernel_stack:19024kB pagetables:3536kB unstable:0kB bounce:0kB free_pcp:4kB local_pcp:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:1148 all_unreclaimable? yes [44941.962556] lowmem_reserve[]: 0 0 0 0 [44941.975591] Node 0 DMA: 0*4kB 1*8kB (U) 1*16kB (U) 0*32kB 1*64kB (U) 1*128kB (U) 0*256kB 0*512kB 1*1024kB (U) 1*2048kB (M) 3*4096kB (M) = 15576kB [44942.020723] Node 0 DMA32: 400*4kB (UEM) 615*8kB (UEM) 478*16kB (UEM) 418*32kB (UEM) 172*64kB (UEM) 22*128kB (EM) 11*256kB (M) 6*512kB (UM) 4*1024kB (UM) 4*2048kB (M) 0*4096kB = 59544kB [44942.077164] Node 0 Normal: 183*4kB (UEM) 883*8kB (UEM) 494*16kB (UEM) 417*32kB (UEM) 118*64kB (UEM) 10*128kB (M) 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 37876kB [44942.128198] Node 1 Normal: 1054*4kB (UEM) 729*8kB (UEM) 488*16kB (UEM) 382*32kB (UEM) 265*64kB (UEM) 13*128kB (UM) 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 48704kB [44942.179875] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=1048576kB [44942.209631] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB [44942.238520] Node 1 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=1048576kB [44942.268263] Node 1 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB [44942.297135] 2053 total pagecache pages [44942.310568] 0 pages in swap cache [44942.322563] Swap cache stats: add 69725, delete 69725, find 14748/19926 [44942.345438] Free swap = 16207300kB [44942.358023] Total swap = 16319484kB