Uploaded image for project: 'Lustre'
  1. Lustre
  2. LU-969

2.1 client stack overruns

    XMLWordPrintable

Details

    • Bug
    • Resolution: Fixed
    • Blocker
    • Lustre 2.1.0, Lustre 2.2.0
    • None
    • RHEL 6.2
    • 3
    • 4691

    Description

      We've seen a few 2.1.0 client crashes that appear to be due to stack overruns. We are testing to see if the ORI-377 debug logging patches fix the problem. Two workloads have reproduced it: IOR and an 8192-task MPI job dumping core into Lustre. Here is a stack trace from the core-dumping client and a printout of the thread_info structure of the panicking thread. You can see that many of the thread_info fields do not look sane.

      BUG: unable to handle kernel paging request at fffffffff176d220
      IP: [<ffffffff81051e9c>] update_curr+0x14c/0x1f0
      PGD 1a87067 PUD 1a88067 PMD 0
      Oops: 0000 [#1] SMP
      last sysfs file: /sys/devices/system/cpu/cpu15/cache/index2/shared_cpu_map
      CPU 0
      Modules linked in: xt_owner nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack lmv(U) mgc(U) lustre(U) lov(U) osc(U) lquota(U) mdc(U) fid(U) fld(U) ptlrpc(U) obdclass(U) lvfs(U) ko2iblnd(U) lnet(U) libcfs(U) acpi_cpufreq freq_table mperf ipt_LOG xt_multiport iptable_filter ip_tables ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm ib_addr ib_sa dm_mirror dm_region_hash dm_log dm_mod vhost_net macvtap macvlan tun kvm uinput microcode ahci isci libsas scsi_transport_sas sb_edac edac_core iTCO_wdt iTCO_vendor_support i2c_i801 i2c_core ib_qib(U) ib_mad ib_core ioatdma wmi ipv6 nfs lockd fscache nfs_acl auth_rpcgss sunrpc igb dca [last unloaded: cpufreq_ondemand]
      
      Pid: 119440, comm: amg2006.mvapich Not tainted 2.6.32-220.1chaos.ch5.x86_64 #1 appro appro-512x/S2600JF
      RIP: 0010:[<ffffffff81051e9c>]  [<ffffffff81051e9c>] update_curr+0x14c/0x1f0
      RSP: 0018:ffff880036603db8  EFLAGS: 00010086
      RAX: ffff88063fe22040 RBX: 000000000df6ec00 RCX: ffff8808364f46c0
      RDX: 0000000000018b88 RSI: 0000000000000000 RDI: ffff88063fe22078
      RBP: ffff880036603de8 R08: ffffffff8160b665 R09: 0000000000000000
      R10: 0000000000000010 R11: 0000000000000001 R12: ffff880036616028
      R13: 00000000000f3054 R14: 0000000000000000 R15: ffff88063fe22040
      FS:  00002aaaac245260(0000) GS:ffff880036600000(0000) knlGS:0000000000000000
      CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
      CR2: fffffffff176d220 CR3: 0000000250b9d000 CR4: 00000000000406f0
      DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
      DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
      Process amg2006.mvapich (pid: 119440, threadinfo ffff8805d0418000, task ffff88063fe22040)
      Stack:
       ffff880036603df8 0000000000000082 ffff88063fe22078 ffff880036616028
      <0> 0000000000000000 0000000000000000 ffff880036603e18 ffffffff810524eb
      <0> ffff880036615fc0 0000000000000000 0000000000015fc0 0000000000000000
      Call Trace:
       <IRQ>
       [<ffffffff810524eb>] task_tick_fair+0xdb/0x160
       [<ffffffff81058f14>] scheduler_tick+0xd4/0x290
       [<ffffffff810a0b70>] ? tick_sched_timer+0x0/0xc0
       [<ffffffff8107c322>] update_process_times+0x52/0x70
       [<ffffffff810a0bd6>] tick_sched_timer+0x66/0xc0
       [<ffffffff8109536e>] __run_hrtimer+0x8e/0x1a0
       [<ffffffff81012b59>] ? read_tsc+0x9/0x20
       [<ffffffff81095716>] hrtimer_interrupt+0xe6/0x250
       [<ffffffff814f67bb>] smp_apic_timer_interrupt+0x6b/0x9b
       [<ffffffffa04668de>] ? cfs_mem_cache_free+0xe/0x10 [libcfs]
       [<ffffffff8100bc13>] apic_timer_interrupt+0x13/0x20
       <EOI>
       [<ffffffff81160098>] ? kmem_cache_free+0xd8/0x2b0
       [<ffffffffa07d91f6>] ? loi_list_maint+0xa6/0x130 [osc]
       [<ffffffffa04668de>] cfs_mem_cache_free+0xe/0x10 [libcfs]
       [<ffffffffa08d4e2e>] vvp_page_fini_common+0x13e/0x190 [lustre]
       [<ffffffffa0475a3e>] ? cfs_hash_dual_bd_findadd_locked+0xce/0x100 [libcfs]
       [<ffffffffa08d4f26>] vvp_page_fini+0x26/0x50 [lustre]
       [<ffffffffa059c863>] cl_page_free+0xb3/0x4c0 [obdclass]
       [<ffffffffa08d515d>] ? vvp_page_delete+0x6d/0x100 [lustre]
       [<ffffffffa059ceda>] cl_page_put+0x26a/0x440 [obdclass]
       [<ffffffffa059b6cd>] ? cl_page_delete+0x3d/0xf0 [obdclass]
       [<ffffffffa08c5275>] cl_invalidatepage+0xa5/0x140 [lustre]
       [<ffffffffa08c534b>] ll_releasepage+0x2b/0x50 [lustre]
       [<ffffffff811102d0>] try_to_release_page+0x30/0x60
       [<ffffffff8112a731>] shrink_page_list.clone.0+0x4f1/0x5c0
       [<ffffffff8112aafb>] shrink_inactive_list+0x2fb/0x740
       [<ffffffff8112b80f>] shrink_zone+0x38f/0x520
       [<ffffffff8112c5b4>] zone_reclaim+0x354/0x410
       [<ffffffff8112d200>] ? isolate_pages_global+0x0/0x350
       [<ffffffff81122a94>] get_page_from_freelist+0x694/0x820
       [<ffffffff81123d11>] __alloc_pages_nodemask+0x111/0x940
       [<ffffffff81123d11>] ? __alloc_pages_nodemask+0x111/0x940
       [<ffffffff8115ead0>] ? cache_alloc_refill+0x1c0/0x240
       [<ffffffffa0466a73>] ? cfs_alloc+0x63/0x90 [libcfs]
       [<ffffffff8115e252>] kmem_getpages+0x62/0x170
       [<ffffffff8115e8bf>] cache_grow+0x2cf/0x320
       [<ffffffff8115eb12>] cache_alloc_refill+0x202/0x240
       [<ffffffffa0466a73>] ? cfs_alloc+0x63/0x90 [libcfs]
       [<ffffffff8115f839>] __kmalloc+0x1a9/0x220
       [<ffffffffa0466a73>] cfs_alloc+0x63/0x90 [libcfs]
       [<ffffffffa0675b0a>] ptlrpc_prep_bulk_imp+0x7a/0x350 [ptlrpc]
       [<ffffffffa068510c>] ? lustre_msg_set_timeout+0x9c/0x110 [ptlrpc]
       [<ffffffffa07e049f>] osc_brw_prep_request+0x88f/0x1040 [osc]
       [<ffffffffa07f596b>] ? osc_req_attr_set+0xfb/0x2a0 [osc]
       [<ffffffffa08cf298>] ? ccc_req_attr_set+0x78/0x150 [lustre]
       [<ffffffffa05a523c>] ? cl_req_prep+0x8c/0x190 [obdclass]
       [<ffffffffa07e1d85>] osc_send_oap_rpc+0x1135/0x1bc0 [osc]
       [<ffffffffa0826dc7>] ? lov_merge_lvb_kms+0x127/0x2b0 [lov]
       [<ffffffffa07d3ff1>] ? osc_consume_write_grant+0x81/0x160 [osc]
       [<ffffffffa07e2aee>] osc_check_rpcs+0x2de/0x470 [osc]
       [<ffffffffa07ae2f7>] ? osc_quota_chkdq+0x47/0x3a0 [lquota]
       [<ffffffffa0598098>] ? cl_object_attr_get+0x88/0x1b0 [obdclass]
       [<ffffffffa07d9143>] ? on_list+0x43/0x50 [osc]
       [<ffffffffa07e3693>] osc_queue_async_io+0x3c3/0x8f0 [osc]
       [<ffffffffa0596399>] ? cl_env_hops_keycmp+0x19/0x70 [obdclass]
       [<ffffffffa07f158f>] osc_page_cache_add+0xcf/0x200 [osc]
       [<ffffffffa05994a8>] cl_page_invoke+0xb8/0x160 [obdclass]
       [<ffffffffa059a4b8>] cl_page_cache_add+0x58/0x240 [obdclass]
       [<ffffffffa08c55e3>] ? ll_set_page_dirty+0x13/0x90 [lustre]
       [<ffffffffa08916a6>] ? vvp_write_pending+0x56/0x150 [lustre]
       [<ffffffffa08d6513>] vvp_io_commit_write+0x343/0x5a0 [lustre]
       [<ffffffffa04763a2>] ? cfs_hash_lookup+0x82/0xa0 [libcfs]
       [<ffffffffa05a83af>] cl_io_commit_write+0xaf/0x1e0 [obdclass]
       [<ffffffffa0598759>] ? cl_env_get+0x29/0x350 [obdclass]
       [<ffffffffa08adced>] ll_commit_write+0xed/0x300 [lustre]
       [<ffffffffa08c53a0>] ll_write_end+0x30/0x60 [lustre]
       [<ffffffff811118e4>] generic_file_buffered_write+0x174/0x2a0
       [<ffffffff81070797>] ? current_fs_time+0x27/0x30
       [<ffffffff811131d0>] __generic_file_aio_write+0x250/0x480
       [<ffffffffa0596975>] ? cl_env_info+0x15/0x20 [obdclass]
       [<ffffffff8111346f>] generic_file_aio_write+0x6f/0xe0
       [<ffffffffa08d6e21>] vvp_io_write_start+0xa1/0x270 [lustre]
       [<ffffffffa05a4b88>] cl_io_start+0x68/0x170 [obdclass]
       [<ffffffffa05a9700>] cl_io_loop+0x110/0x1c0 [obdclass]
       [<ffffffffa04763a2>] ? cfs_hash_lookup+0x82/0xa0 [libcfs]
       [<ffffffffa087e92b>] ll_file_io_generic+0x44b/0x580 [lustre]
       [<ffffffffa0474494>] ? cfs_hash_dual_bd_unlock+0x34/0x60 [libcfs]
       [<ffffffffa0598759>] ? cl_env_get+0x29/0x350 [obdclass]
       [<ffffffffa087eb9f>] ll_file_aio_write+0x13f/0x310 [lustre]
       [<ffffffffa05988ce>] ? cl_env_get+0x19e/0x350 [obdclass]
       [<ffffffffa0885231>] ll_file_write+0x171/0x310 [lustre]
       [<ffffffff811cd9ae>] elf_core_dump+0x104e/0x1120
       [<ffffffff8117f9f4>] do_coredump+0x824/0xc10
       [<ffffffff8108034d>] ? __sigqueue_free+0x3d/0x50
       [<ffffffff8108426d>] get_signal_to_deliver+0x1ed/0x460
       [<ffffffff8100a2d5>] do_signal+0x75/0x800
       [<ffffffff81178492>] ? do_readv_writev+0x162/0x1f0
       [<ffffffff81010000>] ? find_oprom+0x340/0x3a0
       [<ffffffff81042117>] ? is_prefetch+0x1a7/0x230
       [<ffffffff8100aaf0>] do_notify_resume+0x90/0xc0
       [<ffffffff8100bb5c>] retint_signal+0x48/0x8c
      Code: a4 00 45 85 db 74 38 48 8b 50 08 8b 5a 18 48 8b 90 10 09 00 00 48 8b 4a 50 48 85 c9 74 21 48 63 db 66 0f 1f 44 00 00 48 8b 51 20 <48> 03 14 dd 20 72 bf 81 4c 01 2a 48 8b 49 78 48 85 c9 75 e8 48
      RIP  [<ffffffff81051e9c>] update_curr+0x14c/0x1f0
       RSP <ffff880036603db8>
      CR2: fffffffff176d220
      

      Here is the thread_info struct:

      crash> struct -x thread_info 0xffff8805d0418000
      struct thread_info {
        task = 0xffff88060000000a, 
        exec_domain = 0xffffea000df6ec38, 
        flags = 0x1, 
        status = 0x0, 
        cpu = 0xdf6ec00, 
        preempt_count = 0xea00, 
        addr_limit = {
          seg = 0x0
        }, 
        restart_block = {
          fn = 0xffff880000021dd8, 
          {
            {
              arg0 = 0x1f, 
              arg1 = 0xffff88043febaa50, 
              arg2 = 0x10, 
              arg3 = 0xffff88043febaa58
            }, 
            futex = {
              uaddr = 0x1f, 
              val = 0x3febaa50, 
              flags = 0xffff8804, 
              bitset = 0x10, 
              time = 0xffff88043febaa58, 
              uaddr2 = 0x97
            }, 
            nanosleep = {
              index = 0x1f, 
              rmtp = 0xffff88043febaa50, 
              compat_rmtp = 0x10, 
              expires = 0xffff88043febaa58
            }, 
            poll = {
              ufds = 0x1f, 
              nfds = 0x3febaa50, 
              has_timeout = 0xffff8804, 
              tv_sec = 0x10, 
              tv_nsec = 0xffff88043febaa58
            }
          }
        }, 
        sysenter_return = 0xffff88043febaa40, 
        uaccess_err = 0x0
      }
      

      Attachments

        Issue Links

          Activity

            People

              hongchao.zhang Hongchao Zhang
              nedbass Ned Bass (Inactive)
              Votes:
              0 Vote for this issue
              Watchers:
              4 Start watching this issue

              Dates

                Created:
                Updated:
                Resolved: