From d78660288e166f2ce93980fb96963e072edb7392 Mon Sep 17 00:00:00 2001
From: root
Date: Thu, 26 Jan 2012 12:02:06 +0200
Subject: [PATCH] MRP-303 handle bulk IO errors correctly.

Don't panic on an incorrect bulk transfer and correctly handle a
request reorder:

 * ptlrpc_check_set(): replace the LBUG() on a failed bulk transfer
   with req->rq_status = -EIO, and skip waiting for the bulk only when
   rq_status is negative (previously: any non-zero rq_status).
 * client_bulk_callback(): force ev->status to -EIO once when fail_loc
   OBD_FAIL_PTLRPC_CLIENT_BULK_CB or OBD_FAIL_PTLRPC_CLIENT_BULK_CB2
   is set.
 * ost_brw_read(): when OBD_FAIL_PTLRPC_CLIENT_BULK_CB2 (0x515) is set,
   do the bulk transfer 3 seconds after the reply has been handled, to
   simulate a network delay or reordering.  The delayed transfer is
   skipped when no bulk descriptor was allocated.
 * sanity.sh: add tests 221a and 221b for both fail_loc values.

Signed-off-by: root
---
 lustre/include/obd_support.h |    1 +
 lustre/ost/ost_handler.c     |   19 +++++++++++++++++--
 lustre/ptlrpc/client.c       |    4 ++--
 lustre/ptlrpc/events.c       |    6 ++++++
 lustre/tests/sanity.sh       |   20 ++++++++++++++++++++
 5 files changed, 46 insertions(+), 4 deletions(-)

diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h
index 5cdebfd..c7269a2 100644
--- a/lustre/include/obd_support.h
+++ b/lustre/include/obd_support.h
@@ -347,6 +347,7 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type,
 #define OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT  0x512
 #define OBD_FAIL_PTLRPC_DROP_REQ_OPC     0x513
 #define OBD_FAIL_PTLRPC_FINISH_REPLAY    0x514
+#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB2  0x515
 
 #define OBD_FAIL_OBD_PING_NET            0x600
 #define OBD_FAIL_OBD_LOG_CANCEL_NET      0x601
diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c
index 8dc1cec..2a1326e 100644
--- a/lustre/ost/ost_handler.c
+++ b/lustre/ost/ost_handler.c
@@ -900,7 +900,8 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti)
         /* Check if client was evicted while we were doing i/o before touching
            network */
         if (rc == 0) {
-                rc = target_bulk_io(exp, desc, &lwi);
+                if (!CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2))
+                        rc = target_bulk_io(exp, desc, &lwi);
                 no_reply = rc != 0;
         }
 
@@ -920,7 +921,7 @@ out_lock:
 out_tls:
         ost_tls_put(req);
 out_bulk:
-        if (desc)
+        if (desc && !CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2))
                 ptlrpc_free_bulk(desc);
 out:
         LASSERT(rc <= 0);
@@ -944,6 +945,20 @@ out:
                       exp->exp_connection->c_remote_uuid.uuid,
                       libcfs_id2str(req->rq_peer));
         }
+        /* send the bulk after the reply to simulate a network delay or
+         * reordering after a router; desc may be NULL (see out_bulk) */
+        if (desc && CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2)) {
+                cfs_waitq_t              waitq;
+                struct l_wait_info       lwi1;
+
+                CDEBUG(D_INFO, "reorder BULK\n");
+                cfs_waitq_init(&waitq);
+
+                lwi1 = LWI_TIMEOUT_INTR(cfs_time_seconds(3), NULL, NULL, NULL);
+                l_wait_event(waitq, 0, &lwi1);
+                rc = target_bulk_io(exp, desc, &lwi);
+                ptlrpc_free_bulk(desc);
+        }
 
         RETURN(rc);
 }
diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c
index 11697d6..e57b486 100644
--- a/lustre/ptlrpc/client.c
+++ b/lustre/ptlrpc/client.c
@@ -1675,7 +1675,7 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set)
                          * process the reply. Similarly if the RPC returned
                          * an error, and therefore the bulk will never arrive.
                          */
-                        if (req->rq_bulk == NULL || req->rq_status != 0) {
+                        if (req->rq_bulk == NULL || req->rq_status < 0) {
                                 ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
                                 GOTO(interpret, req->rq_status);
                         }
@@ -1693,7 +1693,7 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set)
                          * was good after getting the REPLY for her GET or
                          * the ACK for her PUT. */
                         DEBUG_REQ(D_ERROR, req, "bulk transfer failed");
-                        LBUG();
+                        req->rq_status = -EIO;
                 }
 
                 ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
diff --git a/lustre/ptlrpc/events.c b/lustre/ptlrpc/events.c
index 4165f88..cc124ce 100644
--- a/lustre/ptlrpc/events.c
+++ b/lustre/ptlrpc/events.c
@@ -190,6 +190,12 @@ void client_bulk_callback (lnet_event_t *ev)
                  ev->type == LNET_EVENT_UNLINK);
         LASSERT (ev->unlinked);
 
+        if (CFS_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB, CFS_FAIL_ONCE))
+                ev->status = -EIO;
+
+        if (CFS_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2, CFS_FAIL_ONCE))
+                ev->status = -EIO;
+
         CDEBUG((ev->status == 0) ? D_NET : D_ERROR,
                "event type %d, status %d, desc %p\n",
                ev->type, ev->status, desc);
diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh
index da07a92..15f6fe0 100644
--- a/lustre/tests/sanity.sh
+++ b/lustre/tests/sanity.sh
@@ -8338,6 +8338,26 @@ test_220() { #LU-325
 }
 run_test 220 "the preallocated objects in MDS still can be used if ENOSPC is returned by OST with enough disk space"
 
+test_221a() {
+	#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB   0x508
+	$LCTL set_param fail_loc=0x508
+	dd if=/dev/zero of=$DIR/$tfile bs=4096 count=1 conv=fsync
+	$LCTL set_param fail_loc=0
+	df $DIR
+}
+run_test 221a "MRP-303: don't panic on bulk IO fail"
+
+test_221b() {
+	dd if=/dev/zero of=$DIR/$tfile bs=4096 count=1
+	cancel_lru_locks osc
+	#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB2   0x515
+	$LCTL set_param fail_loc=0x515
+	dd of=/dev/null if=$DIR/$tfile bs=4096 count=1
+	$LCTL set_param fail_loc=0
+	df $DIR
+}
+run_test 221b "MRP-303: don't panic on bulk IO reorder"
+
 #
 # tests that do cleanup/setup should be run at the end
 #
-- 
1.7.1