diff --git a/lustre/include/lustre_import.h b/lustre/include/lustre_import.h
index 0e93bcc..b5743d4 100644
--- a/lustre/include/lustre_import.h
+++ b/lustre/include/lustre_import.h
@@ -178,6 +178,16 @@ struct obd_import {
         cfs_list_t                imp_delayed_list;
         /** @} */
 
+        /**
+         * List of requests that are retained for open replay, the imp_cursor
+         * is for accelerating search during replay. They will be removed when
+         * the 'simplified interop' is finished, see bug 24217.
+         * @{
+         */
+        cfs_list_t                imp_committed_list;
+        cfs_list_t               *imp_cursor;
+        /** @} */
+
         /** obd device for this import */
         struct obd_device        *imp_obd;
 
diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h
index f74547c..e28d860 100644
--- a/lustre/include/lustre_net.h
+++ b/lustre/include/lustre_net.h
@@ -1373,6 +1373,7 @@ int ptlrpc_register_rqbd (struct ptlrpc_request_buffer_desc *rqbd);
  * request queues, request management, etc.
  * @{
  */
+void ptlrpc_free_open(struct ptlrpc_request *req);
 void ptlrpc_init_client(int req_portal, int rep_portal, char *name,
                         struct ptlrpc_client *);
 void ptlrpc_cleanup_client(struct obd_import *imp);
diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c
index 73170fc..4b7bbe0 100644
--- a/lustre/llite/llite_lib.c
+++ b/lustre/llite/llite_lib.c
@@ -1169,6 +1169,11 @@ static int ll_setattr_done_writing(struct inode *inode,
                 CERROR("inode %lu mdc truncate failed: rc = %d\n",
                        inode->i_ino, rc);
         }
+        if (mod) {
+                LASSERT(mod->mod_open_req != NULL);
+                ptlrpc_free_open(mod->mod_open_req);
+                obd_mod_put(mod);
+        }
         RETURN(rc);
 }
 
diff --git a/lustre/mdc/mdc_reint.c b/lustre/mdc/mdc_reint.c
index 89cd5be..50592a8 100644
--- a/lustre/mdc/mdc_reint.c
+++ b/lustre/mdc/mdc_reint.c
@@ -191,8 +191,11 @@ int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data,
                          * mod from being freed on eviction (commit callback is
                          * called despite rq_replay flag).
                          * Will be put on mdc_done_writing().
+                         * Another reference for calling ptlrpc_free_open() in
+                         * ll_setattr_done_writing().
                          */
                         obd_mod_get(*mod);
+                        obd_mod_get(*mod);
                 }
         }
 
diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c
index 92c453c..5c8e09c 100644
--- a/lustre/mdc/mdc_request.c
+++ b/lustre/mdc/mdc_request.c
@@ -776,6 +776,9 @@ int mdc_clear_open_replay_data(struct obd_export *exp,
 
         LASSERT(mod != LP_POISON);
 
+        LASSERT(mod->mod_open_req != NULL);
+        ptlrpc_free_open(mod->mod_open_req);
+
         mod->mod_och = NULL;
         och->och_mod = NULL;
         obd_mod_put(mod);
@@ -892,6 +895,7 @@ int mdc_close(struct obd_export *exp, struct md_op_data *op_data,
         if (mod) {
                 if (rc != 0)
                         mod->mod_close_req = NULL;
+
                 /* Since now, mod is accessed through open_req only,
                  * thus close req does not keep a reference on mod anymore. */
                 obd_mod_put(mod);
diff --git a/lustre/mdt/mdt_internal.h b/lustre/mdt/mdt_internal.h
index e84628f..9315ce0 100644
--- a/lustre/mdt/mdt_internal.h
+++ b/lustre/mdt/mdt_internal.h
@@ -86,7 +86,6 @@ struct mdt_file_data {
         struct portals_handle mfd_handle; /* must be first */
         cfs_list_t            mfd_list;   /* protected by med_open_lock */
         __u64                 mfd_xid;    /* xid of the open request */
-        struct lustre_handle  mfd_old_handle; /* old handle in replay case */
         int                   mfd_mode;   /* open mode provided by client */
         struct mdt_object    *mfd_object; /* point to opened object */
 };
@@ -503,6 +502,14 @@ static inline void mdt_export_evict(struct obd_export *exp)
         class_export_put(exp);
 }
 
+/* Find the mfd hashed under this open handle's cookie, or NULL if none. */
+static inline struct mdt_file_data *
+mdt_handle2mfd(const struct lustre_handle *handle)
+{
+        LASSERT(handle != NULL);
+        return class_handle2object(handle->cookie);
+}
+
 int mdt_get_disposition(struct ldlm_reply *rep, int flag);
 void mdt_set_disposition(struct mdt_thread_info *info,
                          struct ldlm_reply *rep, int flag);
@@ -591,8 +598,6 @@ void mdt_mfd_set_mode(struct mdt_file_data *mfd,
 int mdt_reint_open(struct mdt_thread_info *info,
                    struct mdt_lock_handle *lhc);
 
-struct mdt_file_data *mdt_handle2mfd(struct mdt_thread_info *,
-                                     const struct lustre_handle *);
 
 enum {
         MDT_IOEPOCH_CLOSED = 0,
diff --git a/lustre/mdt/mdt_open.c b/lustre/mdt/mdt_open.c
index d581b82..7b399e0 100644
--- a/lustre/mdt/mdt_open.c
+++ b/lustre/mdt/mdt_open.c
@@ -70,32 +70,6 @@ struct mdt_file_data *mdt_mfd_new(void)
         RETURN(mfd);
 }
 
-/*
- * Find the mfd pointed to by handle in global hash table.
- * In case of replay the handle is obsoleted
- * but mfd can be found in mfd list by that handle
- */
-struct mdt_file_data *mdt_handle2mfd(struct mdt_thread_info *info,
-                                     const struct lustre_handle *handle)
-{
-        struct ptlrpc_request *req = mdt_info_req(info);
-        struct mdt_file_data  *mfd;
-        ENTRY;
-
-        LASSERT(handle != NULL);
-        mfd = class_handle2object(handle->cookie);
-        /* during dw/setattr replay the mfd can be found by old handle */
-        if (mfd == NULL && req_is_replay(req)) {
-                struct mdt_export_data *med = &req->rq_export->exp_mdt_data;
-                cfs_list_for_each_entry(mfd, &med->med_open_head, mfd_list) {
-                        if (mfd->mfd_old_handle.cookie == handle->cookie)
-                                RETURN (mfd);
-                }
-                mfd = NULL;
-        }
-        RETURN (mfd);
-}
-
 /* free mfd */
 void mdt_mfd_free(struct mdt_file_data *mfd)
 {
@@ -674,7 +648,7 @@ static int mdt_mfd_open(struct mdt_thread_info *info, struct mdt_object *p,
                          * restart replay, so there maybe some orphan
                          * mfd here, we should remove them */
                         LASSERT(info->mti_rr.rr_handle != NULL);
-                        old_mfd = mdt_handle2mfd(info, info->mti_rr.rr_handle);
+                        old_mfd = mdt_handle2mfd(info->mti_rr.rr_handle);
                         if (old_mfd) {
                                 CDEBUG(D_HA, "del orph mfd %p fid=("DFID") "
                                        "cookie=" LPX64"\n", mfd,
@@ -688,8 +662,22 @@ static int mdt_mfd_open(struct mdt_thread_info *info, struct mdt_object *p,
                         }
                         CDEBUG(D_HA, "Store old cookie "LPX64" in new mfd\n",
                                info->mti_rr.rr_handle->cookie);
-                        mfd->mfd_old_handle.cookie =
-                                info->mti_rr.rr_handle->cookie;
+
+                        /* Reuse the old open handle */
+                        /* Try to handle the case buggy client sending zero
+                         * file handle to us */
+                        if (unlikely(!lustre_handle_is_used(
+                                     (struct lustre_handle *)
+                                     info->mti_rr.rr_handle))) {
+                                CERROR("Zero file handle from client!\n");
+                                class_handle_unhash(&mfd->mfd_handle);
+                                mdt_mfd_close(info, mfd);
+                                RETURN(-EFAULT);
+                        }
+                        class_handle_unhash(&mfd->mfd_handle);
+                        mfd->mfd_handle.h_cookie =
+                                info->mti_rr.rr_handle->cookie;
+                        class_handle_hash_back(&mfd->mfd_handle);
                 }
                 repbody->handle.cookie = mfd->mfd_handle.h_cookie;
 
@@ -843,6 +831,8 @@ static int mdt_finish_open(struct mdt_thread_info *info,
         mfd = NULL;
         if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) {
                 cfs_spin_lock(&med->med_open_lock);
+                /* Last opened file is always inserted at the start of the
+                 * med_open_head list, so this search isn't so expensive */
                 cfs_list_for_each(t, &med->med_open_head) {
                         mfd = cfs_list_entry(t, struct mdt_file_data, mfd_list);
                         if (mfd->mfd_xid == req->rq_xid) {
@@ -1566,7 +1556,7 @@ int mdt_close(struct mdt_thread_info *info)
 
         med = &req->rq_export->exp_mdt_data;
         cfs_spin_lock(&med->med_open_lock);
-        mfd = mdt_handle2mfd(info, &info->mti_ioepoch->handle);
+        mfd = mdt_handle2mfd(&info->mti_ioepoch->handle);
         if (mdt_mfd_closed(mfd)) {
                 cfs_spin_unlock(&med->med_open_lock);
                 CDEBUG(D_INODE, "no handle for file close: fid = "DFID
@@ -1636,7 +1626,7 @@ int mdt_done_writing(struct mdt_thread_info *info)
 
         med = &info->mti_exp->exp_mdt_data;
         cfs_spin_lock(&med->med_open_lock);
-        mfd = mdt_handle2mfd(info, &info->mti_ioepoch->handle);
+        mfd = mdt_handle2mfd(&info->mti_ioepoch->handle);
         if (mfd == NULL) {
                 cfs_spin_unlock(&med->med_open_lock);
                 CDEBUG(D_INODE, "no handle for done write: fid = "DFID
diff --git a/lustre/mdt/mdt_reint.c b/lustre/mdt/mdt_reint.c
index c142d53..b0367ec 100644
--- a/lustre/mdt/mdt_reint.c
+++ b/lustre/mdt/mdt_reint.c
@@ -539,7 +539,7 @@ static int mdt_reint_setattr(struct mdt_thread_info *info,
                 LASSERT(info->mti_ioepoch);
 
                 cfs_spin_lock(&med->med_open_lock);
-                mfd = mdt_handle2mfd(info, &info->mti_ioepoch->handle);
+                mfd = mdt_handle2mfd(&info->mti_ioepoch->handle);
                 if (mfd == NULL) {
                         cfs_spin_unlock(&med->med_open_lock);
                         CDEBUG(D_INODE, "no handle for file close: "
diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c
index 51cd9b9..8a60d28 100644
--- a/lustre/obdclass/genops.c
+++ b/lustre/obdclass/genops.c
@@ -966,6 +966,8 @@ struct obd_import *class_new_import(struct obd_device *obd)
         CFS_INIT_LIST_HEAD(&imp->imp_replay_list);
         CFS_INIT_LIST_HEAD(&imp->imp_sending_list);
         CFS_INIT_LIST_HEAD(&imp->imp_delayed_list);
+        CFS_INIT_LIST_HEAD(&imp->imp_committed_list);
+        imp->imp_cursor = &imp->imp_committed_list;
         cfs_spin_lock_init(&imp->imp_lock);
         imp->imp_last_success_conn = 0;
         imp->imp_state = LUSTRE_IMP_NEW;
diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c
index 214be9c..b5d8061 100644
--- a/lustre/ptlrpc/client.c
+++ b/lustre/ptlrpc/client.c
@@ -2266,6 +2266,38 @@ int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async)
         RETURN(0);
 }
 
+/**
+ * Release an open request that was retained only for open replay.
+ *
+ * With imp_lock held: when the request's transno is not above
+ * imp_peer_committed_transno, it has a commit callback and it is not
+ * flagged rq_committed, run the callback, unlink the request from the
+ * replay list it sits on and pass it to __ptlrpc_req_finished().
+ * rq_replay must already be clear (asserted).
+ *
+ * XXX This will be removed after simplified interop has landed,
+ * see bug 24217.
+ */
+void ptlrpc_free_open(struct ptlrpc_request *req)
+{
+        struct obd_import *imp = req->rq_import;
+
+        cfs_spin_lock(&imp->imp_lock);
+
+        LASSERT(req->rq_replay == 0);
+        if (req->rq_transno <= imp->imp_peer_committed_transno &&
+            req->rq_commit_cb != NULL && !req->rq_committed) {
+                req->rq_commit_cb(req);
+                /* Never leave the replay cursor on an unlinked entry */
+                if (imp->imp_cursor == &req->rq_replay_list)
+                        imp->imp_cursor = req->rq_replay_list.prev;
+                cfs_list_del_init(&req->rq_replay_list);
+                __ptlrpc_req_finished(req, 1);
+        }
+
+        cfs_spin_unlock(&imp->imp_lock);
+}
+
 /**
  * Iterates through replay_list on import and prunes
  * all requests have transno smaller than last_committed for the
@@ -2316,17 +2348,19 @@ void ptlrpc_free_committed(struct obd_import *imp)
                         GOTO(free_req, 0);
                 }
 
-                if (req->rq_replay) {
-                        DEBUG_REQ(D_RPCTRACE, req, "keeping (FL_REPLAY)");
-                        continue;
-                }
-
                 /* not yet committed */
                 if (req->rq_transno > imp->imp_peer_committed_transno) {
                         DEBUG_REQ(D_RPCTRACE, req, "stopping search");
                         break;
                 }
 
+                if (req->rq_replay) {
+                        DEBUG_REQ(D_RPCTRACE, req, "keeping (FL_REPLAY)");
+                        cfs_list_move_tail(&req->rq_replay_list,
+                                           &imp->imp_committed_list);
+                        continue;
+                }
+
                 DEBUG_REQ(D_INFO, req, "commit (last_committed "LPU64")",
                           imp->imp_peer_committed_transno);
 free_req:
@@ -2435,17 +2469,16 @@ void ptlrpc_retain_replayable_request(struct ptlrpc_request *req,
                         cfs_list_entry(tmp, struct ptlrpc_request,
                                        rq_replay_list);
 
-                /* We may have duplicate transnos if we create and then
-                 * open a file, or for closes retained if to match creating
-                 * opens, so use req->rq_xid as a secondary key.
-                 * (See bugs 684, 685, and 428.)
-                 * XXX no longer needed, but all opens need transnos!
-                 */
                 if (iter->rq_transno > req->rq_transno)
                         continue;
 
-                if (iter->rq_transno == req->rq_transno) {
+                /* XXX The transno of each request should be unique,
+                 * however in case of whatever reason we might need
+                 * non-unique transno in future, I just leave the
+                 * following checking code. */
+                if (unlikely(iter->rq_transno == req->rq_transno)) {
                         LASSERT(iter->rq_xid != req->rq_xid);
+                        DEBUG_REQ(D_WARNING, req, "duplicated transno!");
                         if (iter->rq_xid > req->rq_xid)
                                 continue;
                 }
diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c
index 1464917..a6d02dd 100644
--- a/lustre/ptlrpc/import.c
+++ b/lustre/ptlrpc/import.c
@@ -568,17 +568,31 @@ static int ptlrpc_first_transno(struct obd_import *imp, __u64 *transno)
         struct ptlrpc_request *req;
         cfs_list_t *tmp;
 
-        if (cfs_list_empty(&imp->imp_replay_list))
-                return 0;
-        tmp = imp->imp_replay_list.next;
-        req = cfs_list_entry(tmp, struct ptlrpc_request, rq_replay_list);
-        *transno = req->rq_transno;
-        if (req->rq_transno == 0) {
-                DEBUG_REQ(D_ERROR, req, "zero transno in replay");
-                LBUG();
+        /* The requests in committed_list always have smaller transnos than
+         * the requests in replay_list */
+        if (!cfs_list_empty(&imp->imp_committed_list)) {
+                tmp = imp->imp_committed_list.next;
+                req = cfs_list_entry(tmp, struct ptlrpc_request, rq_replay_list);
+                *transno = req->rq_transno;
+                if (req->rq_transno == 0) {
+                        DEBUG_REQ(D_ERROR, req, "zero transno in committed_list");
+                        LBUG();
+                }
+                return 1;
+        }
+
+        if (!cfs_list_empty(&imp->imp_replay_list)) {
+                tmp = imp->imp_replay_list.next;
+                req = cfs_list_entry(tmp, struct ptlrpc_request, rq_replay_list);
+                *transno = req->rq_transno;
+                if (req->rq_transno == 0) {
+                        DEBUG_REQ(D_ERROR, req, "zero transno in replay_list");
+                        LBUG();
+                }
+                return 1;
         }
 
-        return 1;
+        return 0;
 }
 
 /**
diff --git a/lustre/ptlrpc/ptlrpc_module.c b/lustre/ptlrpc/ptlrpc_module.c
index f1be6c9..2e89f8f 100644
--- a/lustre/ptlrpc/ptlrpc_module.c
+++ b/lustre/ptlrpc/ptlrpc_module.c
@@ -168,6 +168,7 @@ EXPORT_SYMBOL(ptlrpc_resend_req);
 EXPORT_SYMBOL(ptl_send_rpc);
 
 /* client.c */
+EXPORT_SYMBOL(ptlrpc_free_open);
 EXPORT_SYMBOL(ptlrpc_init_client);
 EXPORT_SYMBOL(ptlrpc_cleanup_client);
 EXPORT_SYMBOL(ptlrpc_uuid_to_connection);
diff --git a/lustre/ptlrpc/recover.c b/lustre/ptlrpc/recover.c
index 17085d6..1e57995 100644
--- a/lustre/ptlrpc/recover.c
+++ b/lustre/ptlrpc/recover.c
@@ -116,24 +116,60 @@ int ptlrpc_replay_next(struct obd_import *imp, int *inflight)
          * imp_lock is being held by ptlrpc_replay, but it's not. it's
          * just a little race...
          */
-        cfs_list_for_each_safe(tmp, pos, &imp->imp_replay_list) {
+
+        /* Replay the committed_list first, since the requests on it have
+         * smaller transnos than the replay_list */
+        if (!cfs_list_empty(&imp->imp_committed_list)) {
+                tmp = imp->imp_committed_list.prev;
                 req = cfs_list_entry(tmp, struct ptlrpc_request,
                                      rq_replay_list);
 
-                /* If need to resend the last sent transno (because a
-                   reconnect has occurred), then stop on the matching
-                   req and send it again. If, however, the last sent
-                   transno has been committed then we continue replay
-                   from the next request. */
                 if (req->rq_transno > last_transno) {
-                        if (imp->imp_resend_replay)
-                                lustre_msg_add_flags(req->rq_reqmsg,
-                                                     MSG_RESENT);
-                        break;
+                        /* Since the imp_committed_list is immutable before
+                         * all of its requests being replayed, it's safe to
+                         * use a cursor to accelerate the search.  On a resend
+                         * the cursor may still sit on the request that has to
+                         * be sent again, so do not step past it first. */
+                        if (!imp->imp_resend_replay ||
+                            imp->imp_cursor == &imp->imp_committed_list)
+                                imp->imp_cursor = imp->imp_cursor->next;
+
+                        while (imp->imp_cursor != &imp->imp_committed_list) {
+                                req = cfs_list_entry(imp->imp_cursor,
+                                                     struct ptlrpc_request,
+                                                     rq_replay_list);
+                                if (req->rq_transno > last_transno)
+                                        break;
+                                req = NULL;
+                                imp->imp_cursor = imp->imp_cursor->next;
+                        }
+                } else {
+                        /* If the transno of last request is smaller than the
+                         * last_replayed_transno, which indicates all the
+                         * requests on committed_list have been replayed. */
+                        imp->imp_cursor = &imp->imp_committed_list;
+                        req = NULL;
+                }
+        }
+
+        if (req == NULL) {
+                cfs_list_for_each_safe(tmp, pos, &imp->imp_replay_list) {
+                        req = cfs_list_entry(tmp, struct ptlrpc_request,
+                                             rq_replay_list);
+
+                        if (req->rq_transno > last_transno)
+                                break;
+                        req = NULL;
                 }
-                req = NULL;
         }
 
+        /* If need to resend the last sent transno (because a reconnect
+         * has occurred), then stop on the matching req and send it again.
+         * If, however, the last sent transno has been committed then we
+         * continue replay from the next request. */
+        if (req != NULL && imp->imp_resend_replay)
+                lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT);
+
         cfs_spin_lock(&imp->imp_lock);
         imp->imp_resend_replay = 0;
         cfs_spin_unlock(&imp->imp_lock);