Uploaded image for project: 'Lustre'
  1. Lustre
  2. LU-2889

There's a race between starting and stopping service threads

    XMLWordPrintable

Details

    • Bug
    • Resolution: Fixed
    • Critical
    • Lustre 2.5.0
    • Lustre 2.4.0, Lustre 2.1.5, Lustre 1.8.9
    • no specific environment is needed
    • 3
    • 6971

    Description

      When ptlrpc_start_thread fails to create a new thread, it finalizes and frees the struct ptlrpc_thread that it created and used. Because of this, a problem can occur when ptlrpc_svcpt_stop_threads runs and handles that struct ptlrpc_thread right before or right after cfs_create_thread fails.

      In this situation, both ptlrpc_start_thread and ptlrpc_svcpt_stop_threads can access the freed ptlrpc_thread, which causes an OS panic.

      ptlrpc_start_thread
      int ptlrpc_start_thread(struct ptlrpc_service_part *svcpt, int wait)
      {
              ...
      
              spin_lock(&svcpt->scp_lock);
      
              ...
      
              cfs_list_add(&thread->t_link, &svcpt->scp_threads);
              spin_unlock(&svcpt->scp_lock);
      
              if (svcpt->scp_cpt >= 0) {
                      snprintf(thread->t_name, PTLRPC_THR_NAME_LEN, "%s%02d_%03d",
                               svc->srv_thread_name, svcpt->scp_cpt, thread->t_id);
              } else {
                      snprintf(thread->t_name, PTLRPC_THR_NAME_LEN, "%s_%04d",
                               svc->srv_thread_name, thread->t_id);
              }
      
              CDEBUG(D_RPCTRACE, "starting thread '%s'\n", thread->t_name);
              /*
               * CLONE_VM and CLONE_FILES just avoid a needless copy, because we
               * just drop the VM and FILES in cfs_daemonize_ctxt() right away.
               */
              rc = cfs_create_thread(ptlrpc_main, thread, CFS_DAEMON_FLAGS);
              if (rc < 0) {
                      CERROR("cannot start thread '%s': rc %d\n",
                             thread->t_name, rc); 
                                                  //////////////////////////////////////
                                                  // <---- suppose
                                                  // ptlrpc_svcpt_stop_threads runs here
                                                  //////////////////////////////////////
                      spin_lock(&svcpt->scp_lock);
                      cfs_list_del(&thread->t_link);
                      --svcpt->scp_nthrs_starting;
                      spin_unlock(&svcpt->scp_lock);
      
                      OBD_FREE(thread, sizeof(*thread));
                      RETURN(rc);
              }
      
          ...
      
      }
      
      ptlrpc_svcpt_stop_threads
      /*
       * Flag every thread on svcpt->scp_threads with SVC_STOPPING, wait until
       * each one reports stopped, and then free the thread structures.
       *
       * NOTE(review): the wait below runs with scp_lock dropped while still
       * holding a pointer to a list entry. A concurrent path that unlinks and
       * frees that entry (e.g. the cfs_create_thread() failure path shown in
       * ptlrpc_start_thread()) would leave this pointer dangling.
       */
      static void ptlrpc_svcpt_stop_threads(struct ptlrpc_service_part *svcpt)
      {
              struct l_wait_info      lwi = { 0 };
              struct ptlrpc_thread    *thread;
              /* local list of stopped threads, freed after scp_lock is released */
              CFS_LIST_HEAD           (zombie);
      
              ENTRY;
      
              CDEBUG(D_INFO, "Stopping threads for service %s\n",
                     svcpt->scp_service->srv_name);
      
              spin_lock(&svcpt->scp_lock);
              /* let the thread know that we would like it to stop asap */
              list_for_each_entry(thread, &svcpt->scp_threads, t_link) {
                      CDEBUG(D_INFO, "Stopping thread %s #%u\n",
                             svcpt->scp_service->srv_thread_name, thread->t_id);
                      thread_add_flags(thread, SVC_STOPPING);
              }
      
              /* presumably wakes threads sleeping on scp_waitq so they notice
               * SVC_STOPPING -- confirm against ptlrpc_main */
              cfs_waitq_broadcast(&svcpt->scp_waitq);
      
              /*
               * Always look at the list head: a stopped thread is moved to the
               * zombie list, otherwise we wait for that thread and re-check.
               */
              while (!cfs_list_empty(&svcpt->scp_threads)) {
                      thread = cfs_list_entry(svcpt->scp_threads.next,
                                              struct ptlrpc_thread, t_link);
                      if (thread_is_stopped(thread)) {
                              cfs_list_del(&thread->t_link);
                              cfs_list_add(&thread->t_link, &zombie);
                              continue;
                      }
                      /* scp_lock is released before waiting; from here until it
                       * is re-taken, 'thread' is used without the lock held */
                      spin_unlock(&svcpt->scp_lock);
      
                      CDEBUG(D_INFO, "waiting for stopping-thread %s #%u\n",
                             svcpt->scp_service->srv_thread_name, thread->t_id);
                      l_wait_event(thread->t_ctl_waitq,
                                   thread_is_stopped(thread), &lwi);
      
                      spin_lock(&svcpt->scp_lock);
              }
      
              spin_unlock(&svcpt->scp_lock);
      
              /* free the collected thread structures outside of scp_lock */
              while (!cfs_list_empty(&zombie)) {
                      thread = cfs_list_entry(zombie.next,
                                              struct ptlrpc_thread, t_link);
                      cfs_list_del(&thread->t_link);
                      OBD_FREE_PTR(thread);
              }
              EXIT;
      }
      

      Attachments

        Activity

          People

            keith Keith Mannthey (Inactive)
            nozaki Hiroya Nozaki (Inactive)
            Votes:
            0 Vote for this issue
            Watchers:
            4 Start watching this issue

            Dates

              Created:
              Updated:
              Resolved: