diff --git a/src/mpi/romio/adio/ad_lustre/Makefile.mk b/src/mpi/romio/adio/ad_lustre/Makefile.mk index 43eaa02..17d4ed0 100644 --- a/src/mpi/romio/adio/ad_lustre/Makefile.mk +++ b/src/mpi/romio/adio/ad_lustre/Makefile.mk @@ -16,7 +16,8 @@ romio_other_sources += \ adio/ad_lustre/ad_lustre_wrcoll.c \ adio/ad_lustre/ad_lustre_wrstr.c \ adio/ad_lustre/ad_lustre_hints.c \ - adio/ad_lustre/ad_lustre_aggregate.c + adio/ad_lustre/ad_lustre_aggregate.c \ + adio/ad_lustre/ad_lustre_lock.c endif BUILD_AD_LUSTRE diff --git a/src/mpi/romio/adio/ad_lustre/ad_lustre_hints.c b/src/mpi/romio/adio/ad_lustre/ad_lustre_hints.c index 1d40c86..a9b27a7 100644 --- a/src/mpi/romio/adio/ad_lustre/ad_lustre_hints.c +++ b/src/mpi/romio/adio/ad_lustre/ad_lustre_hints.c @@ -23,6 +23,15 @@ void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code) int myrank; static char myname[] = "ADIOI_LUSTRE_SETINFO"; + /* Set lock ahead default hints */ + fd->hints->fs_hints.lustre.lock_ahead_read = 0; + fd->hints->fs_hints.lustre.lock_ahead_write = 0; + fd->hints->fs_hints.lustre.lock_ahead_num_extents = 500; + fd->hints->fs_hints.lustre.lock_ahead_flags = 0; + fd->hints->fs_hints.lustre.lock_ahead_ladvise = 0; + fd->hints->fs_hints.lustre.lock_ahead_start_extent = INT64_MAX; + fd->hints->fs_hints.lustre.lock_ahead_end_extent = -1UL; + value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); if ( (fd->info) == MPI_INFO_NULL) { /* This must be part of the open call. 
can set striping parameters @@ -79,6 +88,43 @@ void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code) ADIOI_Info_set(fd->info, "direct_write", "true"); fd->direct_write = 1; } + + /* Get lock ahead hints */ + + ADIOI_Info_check_and_install_int(fd, users_info, + "romio_lustre_cb_lock_ahead_write", + &(fd->hints->fs_hints.lustre.lock_ahead_write), + myname, error_code ); + ADIOI_Info_check_and_install_int(fd, users_info, + "romio_lustre_cb_lock_ahead_read", + &(fd->hints->fs_hints.lustre.lock_ahead_read), + myname, error_code ); + + /* If, and only if, we're using lock ahead, + process/set the number of extents to pre-lock and the flags */ + if (fd->hints->fs_hints.lustre.lock_ahead_read || + fd->hints->fs_hints.lustre.lock_ahead_write) { + /* Get user's number of extents*/ + ADIOI_Info_check_and_install_int(fd, users_info, + "romio_lustre_cb_lock_ahead_num_extents", + &(fd->hints->fs_hints.lustre.lock_ahead_num_extents), + myname, error_code ); + + /* ADIOI_Info_check_and_install_int doesn't set the + value in fd unless it was in user_info, but knowing + the value - default or explicit - is useful. + Set the final number of extents in the fd->info */ + MPL_snprintf(value, MPI_MAX_INFO_VAL+1, "%d", + fd->hints->fs_hints.lustre.lock_ahead_num_extents); + ADIOI_Info_set(fd->info, "romio_lustre_cb_lock_ahead_num_extents", + value); + + /* Get user's flags*/ + ADIOI_Info_check_and_install_int(fd, users_info, + "romio_lustre_cb_lock_ahead_flags", + &(fd->hints->fs_hints.lustre.lock_ahead_flags), + myname, error_code ); + } } /* set striping information with ioctl */ diff --git a/src/mpi/romio/adio/ad_lustre/ad_lustre_lock.c b/src/mpi/romio/adio/ad_lustre/ad_lustre_lock.c new file mode 100644 index 0000000..346c7ac --- /dev/null +++ b/src/mpi/romio/adio/ad_lustre/ad_lustre_lock.c @@ -0,0 +1,534 @@ +/* + * Copyright 2016 Cray Inc. All Rights Reserved. 
+ */
+#include "adio.h"
+
+#include "ad_lustre.h"
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+/* Standard headers for names this file uses directly. */
+#include <errno.h>      /* errno, EINVAL, ENOMEM, ENOTTY */
+#include <stddef.h>     /* offsetof, size_t */
+#include <stdint.h>     /* INT64_MAX */
+#include <time.h>       /* time */
+
+/* If necessary (older lustre client headers) define the new
+   locking structures. */
+
+/* Define LOCK_AHEAD_DEBUG to get per-extent diagnostics on stderr. */
+/* #define LOCK_AHEAD_DEBUG */
+
+#ifndef LL_IOC_LADVISE
+#define LL_IOC_LADVISE _IOR('f', 250, struct llapi_lu_ladvise)
+
+enum lu_ladvise_type {
+    LU_LADVISE_INVALID = 0,
+    LU_LADVISE_WILLREAD = 1,
+    LU_LADVISE_DONTNEED = 2,
+    LU_LADVISE_LOCKNOEXPAND = 3,
+    LU_LADVISE_LOCKAHEAD = 4,
+    LU_LADVISE_MAX
+};
+
+#define LU_LADVISE_NAMES { \
+    [LU_LADVISE_WILLREAD] = "willread", \
+    [LU_LADVISE_DONTNEED] = "dontneed", \
+    [LU_LADVISE_LOCKNOEXPAND] = "locknoexpand", \
+    [LU_LADVISE_LOCKAHEAD] = "lockahead", \
+}
+
+/* This is the userspace argument for ladvise. It is currently the same as
+ * what goes on the wire (struct lu_ladvise), but is defined separately as we
+ * may need info which is only used locally. */
+struct llapi_lu_ladvise {
+    __u16 lla_advice;   /* advice type */
+    __u16 lla_value1;   /* values for different advice types */
+    __u32 lla_value2;
+    __u64 lla_start;    /* first byte of extent for advice */
+    __u64 lla_end;      /* last byte of extent for advice */
+    __u32 lla_value3;
+    __u32 lla_value4;
+};
+
+enum ladvise_flag {
+    LF_ASYNC = 0x00000001,
+    LF_UNSET = 0x00000002,
+    /* For lock requests */
+    LF_NONBLOCK = 0x00000003,
+};
+
+#define LADVISE_MAGIC 0x1ADF1CE0
+/* Masks of valid flags for each advice */
+#define LF_LOCKNOEXPAND_MASK LF_UNSET
+#define LF_LOCKAHEAD_MASK LF_NONBLOCK
+/* Flags valid for all advices not explicitly specified */
+#define LF_DEFAULT_MASK LF_ASYNC
+/* All flags */
+#define LF_MASK (LF_ASYNC | LF_UNSET | LF_NONBLOCK)
+
+/* Lock-ahead specific names for the generic value fields of an advice. */
+#define lla_lockahead_mode lla_value1
+#define lla_peradvice_flags lla_value2
+#define lla_lockahead_result lla_value3
+
+/* This is the userspace argument for ladvise, corresponds to ladvise_hdr which
+ * is used on the wire. It is defined separately as we may need info which is
+ * only used locally.
+ */
+struct llapi_ladvise_hdr {
+    __u32 lah_magic;    /* LADVISE_MAGIC */
+    __u32 lah_count;    /* number of advices */
+    __u64 lah_flags;    /* from enum ladvise_flag */
+    __u32 lah_value1;   /* unused */
+    __u32 lah_value2;   /* unused */
+    __u64 lah_value3;   /* unused */
+    struct llapi_lu_ladvise lah_advise[0];      /* advices in this header */
+};
+
+#define LAH_COUNT_MAX (1024)
+
+enum lock_mode_user {
+    MODE_READ_USER = 1,
+    MODE_WRITE_USER,
+    MODE_MAX_USER,
+};
+
+#define LOCK_MODE_NAMES { \
+    [MODE_READ_USER] = "READ",\
+    [MODE_WRITE_USER] = "WRITE"\
+}
+
+enum lockahead_results {
+    LLA_RESULT_SENT = 0,
+    LLA_RESULT_DIFFERENT,
+    LLA_RESULT_SAME,
+};
+#endif
+
+/* PAF: may not need this... */
+/* Trivial helper for one advice: fill *advice as a lock-ahead request for
+ * the byte range [start, end].
+ *
+ *   advice - advice slot to fill in; every field of it is written
+ *   mode   - stored in lla_lockahead_mode
+ *   flags  - per-advice flags; LF_ASYNC is always OR'ed in
+ *   start  - first byte of the extent
+ *   end    - last byte of the extent
+ *
+ * NOTE(review): start/end are size_t, so file offsets are truncated wherever
+ * size_t is narrower than 64 bits - confirm that is acceptable. */
+void setup_ladvise_requestlock_2(struct llapi_lu_ladvise *advice, int mode,
+                                 int flags, size_t start, size_t end)
+{
+    advice->lla_advice = LU_LADVISE_LOCKAHEAD;
+    advice->lla_lockahead_mode = mode;
+    advice->lla_peradvice_flags = flags | LF_ASYNC;
+    advice->lla_start = start;
+    advice->lla_end = end;
+    advice->lla_value3 = 0;
+    advice->lla_value4 = 0;
+    /* With the fallback definitions above this is the same field as
+     * lla_value3; it is cleared by name as well so the result slot starts
+     * out as "sent". */
+    advice->lla_lockahead_result = 0;
+}
+
+/* This is this way for the sake of simplicity when building... We'll do better later. 
*/
+/* Submit num_advise advices for the open file fd with the LL_IOC_LADVISE
+ * ioctl.
+ *
+ *   fd         - file descriptor handed to ioctl()
+ *   flags      - header flags; bits outside LF_MASK are dropped
+ *   num_advise - number of entries in ladvise, 1 .. LAH_COUNT_MAX - 1
+ *   ladvise    - advices to send; for LU_LADVISE_LOCKAHEAD entries the
+ *                lla_lockahead_result field is updated from the reply
+ *
+ * Returns 0 on success.  Returns -1 with errno set on failure: EINVAL for a
+ * NULL array or a count out of range, ENOMEM when the request buffer cannot
+ * be allocated, otherwise the errno left by ioctl(). */
+int llapi_ladvise_2(int fd, unsigned long long flags, int num_advise,
+                    struct llapi_lu_ladvise *ladvise)
+{
+    struct llapi_ladvise_hdr *ladvise_hdr;
+    size_t hdr_size;
+    int saved_errno;
+    int rc;
+    int i;
+
+    if (ladvise == NULL || num_advise < 1 || num_advise >= LAH_COUNT_MAX) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    /* Header followed by num_advise advices (portable spelling of the
+     * offsetof(typeof(...), lah_advise[n]) idiom). */
+    hdr_size = offsetof(struct llapi_ladvise_hdr, lah_advise) +
+        sizeof(*ladvise) * (size_t) num_advise;
+    ladvise_hdr = calloc(1, hdr_size);
+    if (ladvise_hdr == NULL) {
+        errno = ENOMEM;
+        return -1;
+    }
+    ladvise_hdr->lah_magic = LADVISE_MAGIC;
+    ladvise_hdr->lah_count = num_advise;
+    ladvise_hdr->lah_flags = flags & LF_MASK;
+    memcpy(ladvise_hdr->lah_advise, ladvise, sizeof(*ladvise) * (size_t) num_advise);
+
+    rc = ioctl(fd, LL_IOC_LADVISE, ladvise_hdr);
+    if (rc < 0) {
+        /* Callers inspect errno (e.g. for ENOTTY), so keep it intact
+         * across the release of the request buffer. */
+        saved_errno = errno;
+        free(ladvise_hdr);
+        errno = saved_errno;
+        return -1;
+    }
+
+    /* Copy results back in to caller provided structs */
+    for (i = 0; i < num_advise; i++) {
+        const struct llapi_lu_ladvise *ladvise_iter = &ladvise_hdr->lah_advise[i];
+
+        if (ladvise_iter->lla_advice == LU_LADVISE_LOCKAHEAD)
+            ladvise[i].lla_lockahead_result = ladvise_iter->lla_lockahead_result;
+    }
+
+    free(ladvise_hdr);
+    return 0;
+}
+
+#ifndef LL_IOC_REQUEST_ONLY
+/* Declaration for lustre request-only locking */
+#define LL_IOC_REQUEST_ONLY _IO('f', 252)
+#endif
+
+#ifndef LL_IOC_LOCK_AHEAD
+/* Declarations for lustre lock ahead */
+#define LL_IOC_LOCK_AHEAD _IOWR('f', 251, struct llapi_lock_ahead_arg)
+
+typedef enum {
+    READ_USER = 1,
+    WRITE_USER,
+    MAX_USER,
+} lock_mode_user;
+
+struct llapi_lock_ahead_extent {
+    __u64 start;
+    __u64 end;
+    /* 0 on success, -ERRNO on error, 1 when a
+     * matching but non-identical lock is found, 2
+     * when a matching and identical lock is found */
+    __s32 result;
+};
+
+/* lock ahead ioctl arguments */
+struct llapi_lock_ahead_arg {
+    __u32 lla_version;
+    __u32 lla_lock_mode;
+    __u32 lla_flags;
+    __u32 lla_extent_count;
+    struct llapi_lock_ahead_extent lla_extents[0];
+};
+#endif
+
+/* Set lustre locks to only lock the requested byte range, do not
+   extend any locks to 'infinity' which is the normal behavior.
+   This will enhance 'lock ahead' extent locking, which can be interfered
+   with by other requests getting expanded.
+
+   Tries the LL_IOC_REQUEST_ONLY ioctl first and, when that ioctl is not
+   supported (ENOTTY), sends a LU_LADVISE_LOCKNOEXPAND advice instead.
+   Returns the result of whichever request was made last. */
+int ADIOI_LUSTRE_noexpand_locks(ADIO_File fd)
+{
+    int err;
+
+    err = ioctl(fd->fd_sys, LL_IOC_REQUEST_ONLY);
+
+    /* errno is only meaningful when the ioctl reported a failure; looking
+     * at it after a success could pick up a stale ENOTTY. */
+    if (err == -ENOTTY || (err == -1 && errno == ENOTTY)) {
+        /* Zero-initialised so no indeterminate field is sent. */
+        struct llapi_lu_ladvise advice_noexpand = { 0 };
+
+        advice_noexpand.lla_advice = LU_LADVISE_LOCKNOEXPAND;
+        advice_noexpand.lla_peradvice_flags = 0;
+        err = llapi_ladvise_2(fd->fd_sys, 0, 1, &advice_noexpand);
+    }
+
+    return err;
+}
+
+/* Use group locks to 'clear' existing locks on the file
+   before attempting 'lock ahead' extent locking.
+
+   Only the process whose my_cb_nodes_index is 0 takes and drops the group
+   lock; every other process returns 0 without touching the file.
+   Returns 0 on success or the failing ioctl()'s return value. */
+int ADIOI_LUSTRE_clear_locks(ADIO_File fd)
+{
+    int err = 0;
+    int id;
+
+    if (!fd->my_cb_nodes_index) {
+        srand(time(NULL));
+        /* NOTE(review): a group id of 0 is believed to be rejected by
+         * lustre, so it is skipped - confirm against the client in use. */
+        do {
+            id = rand();
+        } while (id == 0);
+
+        err = ioctl(fd->fd_sys, LL_IOC_GROUP_LOCK, id);
+        /* Only drop the lock if it was actually taken, and do not let the
+         * unlock result hide a failed lock. */
+        if (err == 0)
+            err = ioctl(fd->fd_sys, LL_IOC_GROUP_UNLOCK, id);
+    }
+    return err;
+}
+
+/* Request locks on num_extents extents of stripe_size bytes each, the first
+ * starting at *offset and each following one step_size bytes further on.
+ *
+ * The LL_IOC_LOCK_AHEAD ioctl is used unless the lock_ahead_ladvise hint is
+ * already set.  If the ioctl is not supported (ENOTTY) the hint is set and
+ * the same extents are requested through ladvise instead.
+ *
+ *   fd          - file; its lustre hints select read/write mode and flags,
+ *                 and receive the first/last byte of the requested range
+ *   num_extents - number of extents to request, must be >= 1
+ *   offset      - in: start of the first extent; out: start of the extent
+ *                 that would follow the last one requested
+ *   stripe_size - length of each extent in bytes
+ *   step_size   - distance between the starts of consecutive extents
+ *
+ * Returns the result of the last request made (0 on success, -1 with errno
+ * set on failure).  Aborts if neither lock ahead hint is enabled. */
+int do_lock_request(ADIO_File fd, int num_extents, ADIO_Offset *offset, int stripe_size, ADIO_Offset step_size)
+{
+    const ADIO_Offset first_offset = *offset;
+    const int flags = fd->hints->fs_hints.lustre.lock_ahead_flags;
+    const int want_write = fd->hints->fs_hints.lustre.lock_ahead_write;
+    const int want_read = fd->hints->fs_hints.lustre.lock_ahead_read;
+    int saved_errno;
+    int err;
+    int i;
+
+    /* Both paths index element [num_extents - 1]. */
+    if (num_extents < 1) {
+        errno = EINVAL;
+        return -1;
+    }
+    if (!want_write && !want_read)
+        MPI_Abort(MPI_COMM_WORLD, 1);
+
+    if (!fd->hints->fs_hints.lustre.lock_ahead_ladvise) {
+        struct llapi_lock_ahead_arg *arg;
+
+        arg = ADIOI_Malloc(sizeof(struct llapi_lock_ahead_arg) +
+                           sizeof(struct llapi_lock_ahead_extent) * num_extents);
+        for (i = 0; i < num_extents; ++i) {
+            arg->lla_extents[i].start = *offset;
+            arg->lla_extents[i].end = *offset + stripe_size - 1;
+            arg->lla_extents[i].result = 0;
+            *offset += step_size;
+        }
+        /* Simply save the new start/end extents, forget what we already had
+           locked since lustre may reclaim it at any time. */
+        fd->hints->fs_hints.lustre.lock_ahead_start_extent = arg->lla_extents[0].start;
+        fd->hints->fs_hints.lustre.lock_ahead_end_extent = arg->lla_extents[num_extents - 1].end;
+
+        arg->lla_version = 1;
+        arg->lla_extent_count = num_extents;
+        arg->lla_flags = flags;
+        /* write also covers read/write */
+        arg->lla_lock_mode = want_write ? WRITE_USER : READ_USER;
+
+        err = ioctl(fd->fd_sys, LL_IOC_LOCK_AHEAD, arg);
+        saved_errno = errno;
+
+#ifdef LOCK_AHEAD_DEBUG
+        /* Print any per extent errors */
+        for (i = 0; i < num_extents; ++i) {
+            if (arg->lla_extents[i].result) {
+                fprintf(stderr, "%s(%d) "
+                        "lock ahead extent[%4.4d] {%ld,%ld} stripe {%lld,%lld} errno %d\n",
+                        __func__, __LINE__,
+                        i,
+                        (long int) arg->lla_extents[i].start,
+                        (long int) arg->lla_extents[i].end,
+                        (long long) (arg->lla_extents[i].start / stripe_size),
+                        (long long) (arg->lla_extents[i].end / stripe_size),
+                        (int) arg->lla_extents[i].result);
+            }
+        }
+#endif
+
+        ADIOI_Free(arg);
+        errno = saved_errno;    /* the caller reports errno on failure */
+
+        /* Done unless the ioctl itself is unsupported.  errno is consulted
+         * only when the call failed, never after a success. */
+        if (!(err == -ENOTTY || (err == -1 && saved_errno == ENOTTY)))
+            return err;
+
+        /* If the ioctl is not supported, we must use ladvise instead.
+         * Rewind so ladvise asks for the same extents rather than the ones
+         * after them. */
+        fd->hints->fs_hints.lustre.lock_ahead_ladvise = 1;
+        *offset = first_offset;
+    }
+
+    {
+        struct llapi_lu_ladvise *advice;
+        const enum lock_mode_user mode = want_write ? MODE_WRITE_USER : MODE_READ_USER;
+
+        advice = ADIOI_Malloc(sizeof(struct llapi_lu_ladvise) * num_extents);
+        for (i = 0; i < num_extents; ++i) {
+            setup_ladvise_requestlock_2(&(advice[i]), mode, flags, *offset,
+                                        *offset + stripe_size - 1);
+            *offset += step_size;
+        }
+        /* Simply save the new start/end extents, forget what we already had
+         * locked since lustre may reclaim it at any time. */
+        fd->hints->fs_hints.lustre.lock_ahead_start_extent = advice[0].lla_start;
+        fd->hints->fs_hints.lustre.lock_ahead_end_extent = advice[num_extents - 1].lla_end;
+
+        err = llapi_ladvise_2(fd->fd_sys, 0, num_extents, advice);
+        saved_errno = errno;
+
+#ifdef LOCK_AHEAD_DEBUG
+        /* Print any per extent errors */
+        for (i = 0; i < num_extents; ++i) {
+            if (advice[i].lla_lockahead_result) {
+                fprintf(stderr, "%s(%d) "
+                        "lock ahead extent[%4.4d] {%ld,%ld} stripe {%lld,%lld} error %d\n",
+                        __func__, __LINE__,
+                        i,
+                        (long int) advice[i].lla_start,
+                        (long int) advice[i].lla_end,
+                        (long long) (advice[i].lla_start / stripe_size),
+                        (long long) (advice[i].lla_end / stripe_size),
+                        (int) advice[i].lla_lockahead_result);
+            }
+        }
+#endif
+
+        ADIOI_Free(advice);
+        errno = saved_errno;
+    }
+
+    return err;
+}
+
+/* Lock a predefined series of 'extents' in the file.
+   The intent is to match the aggregator locking pattern.
+
+     fd             - file; must have its lustre lock ahead hints set up
+     avail_cb_nodes - number of aggregators the stripes are spread over
+     next_offset    - offset of the access that triggered the request; 0 is
+                      taken to mean "first call"
+     error_code     - written only when the lock request fails
+
+   Processes that are not aggregators return without doing anything.  On a
+   failed request lock ahead is switched off for this file. */
+void ADIOI_LUSTRE_lock_ahead_ioctl(ADIO_File fd, int avail_cb_nodes, ADIO_Offset next_offset, int *error_code)
+{
+    int err = 0;
+    int saved_errno;
+    int num_extents = fd->hints->fs_hints.lustre.lock_ahead_num_extents;
+    ADIO_Offset offset = 0, step_size = 0;
+    int stripe_size = fd->hints->striping_unit;
+
+    int agg_idx = fd->my_cb_nodes_index;
+
+    /* Not a collective aggregator? Do nothing and return
+       since current code is based on aggregator/stripes */
+    if (agg_idx < 0) {
+        return;
+    }
+
+    /* Both values are used as divisors below. */
+    if (stripe_size <= 0 || avail_cb_nodes <= 0) {
+        return;
+    }
+
+#ifdef LOCK_AHEAD_DEBUG
+    {
+        /* Debug check. Calculate the expected rank for this stripe */
+        int rank_index;
+        rank_index = (int)((next_offset / stripe_size) % avail_cb_nodes);
+        /* Not sure why, but this happens in the generic read coll?
+           It doesn't do the aggregation striped quite as expected.
+           We'll probably lock the wrong stripes for this read ...
+           but we're more interested in write locks than read locks
+           so stick with the lustre specific calculations for now.
+           Consider dropping read support if performance isn't improved
+           or ad_lustre doesn't add read coll code.
+         */
+        if (agg_idx != rank_index) {
+            fprintf(stderr, "%s(%d) rank[%d] file system %d "
+                    "lock ahead debug R(%d)/W(%d), "
+                    "aggregator %d(%d)/%d(%d), "
+                    "offset %lld, start offset %lld, stripe %lld "
+                    "num_extents %d\n",
+                    __func__, __LINE__,
+                    fd->hints->ranklist[agg_idx],
+                    fd->file_system,
+                    fd->hints->fs_hints.lustre.lock_ahead_read,
+                    fd->hints->fs_hints.lustre.lock_ahead_write,
+                    agg_idx,rank_index,
+                    avail_cb_nodes,fd->hints->cb_nodes,
+                    (long long)next_offset,(long long)(next_offset / stripe_size * stripe_size),
+                    (long long)next_offset/stripe_size,
+                    num_extents);
+        }
+        /* Just checking the config vs what was passed in */
+        if (agg_idx >= avail_cb_nodes) {
+            fprintf(stderr, "%s(%d) file system %d "
+                    "lock ahead debug R(%d)/W(%d), "
+                    "aggregator %d(%d)/%d(%d), "
+                    "num_extents %d\n",
+                    __func__, __LINE__, fd->file_system,
+                    fd->hints->fs_hints.lustre.lock_ahead_read,
+                    fd->hints->fs_hints.lustre.lock_ahead_write,
+                    agg_idx,rank_index,
+                    avail_cb_nodes,fd->hints->cb_nodes,
+                    num_extents);
+            MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+    }
+#endif
+
+    /* Check file access vs requested lock ahead */
+    if (fd->access_mode & ADIO_RDONLY) {
+        /* Don't need write lock ahead */
+        fd->hints->fs_hints.lustre.lock_ahead_write = 0;
+
+        /* Do need read lock ahead or give up. */
+        if (!(fd->hints->fs_hints.lustre.lock_ahead_read)) {
+            fd->hints->fs_hints.lustre.lock_ahead_start_extent = 0;
+            fd->hints->fs_hints.lustre.lock_ahead_end_extent = INT64_MAX;
+            return;
+        }
+    }
+    if (fd->access_mode & ADIO_WRONLY) {
+        /* Don't need read lock ahead */
+        fd->hints->fs_hints.lustre.lock_ahead_read = 0;
+
+        /* Do need write lock ahead or give up. */
+        if (!(fd->hints->fs_hints.lustre.lock_ahead_write)) {
+            fd->hints->fs_hints.lustre.lock_ahead_start_extent = 0;
+            fd->hints->fs_hints.lustre.lock_ahead_end_extent = INT64_MAX;
+            return;
+        }
+    }
+
+    /* Consecutive stripes of one aggregator are avail_cb_nodes stripes apart. */
+    step_size = (ADIO_Offset)avail_cb_nodes * stripe_size;
+
+    if (next_offset == 0) { /* 1st call, calculate our starting offset */
+        offset = (ADIO_Offset)agg_idx * stripe_size;
+    } else { /* Have to assume we're writing to one of our stripes */
+        offset = next_offset / stripe_size * stripe_size; /* start of stripe */
+    }
+
+    err = do_lock_request(fd, num_extents, &offset, stripe_size, step_size);
+    /* Capture errno before any other library call can overwrite it. */
+    saved_errno = errno;
+
+    if (err < 0) { /* turn off lock ahead after a failure */
+#ifdef LOCK_AHEAD_DEBUG
+        fprintf(stderr, "%s(%d) file system %d "
+                "lock ahead failure R(%d)/W(%d), "
+                "aggregator %d/%d, "
+                "next offset %lld, stripe %lld, "
+                "last offset %lld, stripe %lld, "
+                "step %lld, stripe size %lld "
+                "num_extents %d\n",
+                __func__, __LINE__, fd->file_system,
+                fd->hints->fs_hints.lustre.lock_ahead_read,
+                fd->hints->fs_hints.lustre.lock_ahead_write,
+                agg_idx,
+                avail_cb_nodes,
+                (long long)next_offset,(long long)next_offset/stripe_size,
+                (long long)offset,(long long)offset/stripe_size,
+                (long long)step_size,
+                (long long)stripe_size,
+                num_extents);
+#endif
+        fd->hints->fs_hints.lustre.lock_ahead_read = 0;
+        fd->hints->fs_hints.lustre.lock_ahead_write = 0;
+        fd->hints->fs_hints.lustre.lock_ahead_start_extent = 0;
+        fd->hints->fs_hints.lustre.lock_ahead_end_extent = INT64_MAX;
+
+        *error_code = ADIOI_Err_create_code("ADIOI_LUSTRE_lock_ahead_ioctl", fd->filename, saved_errno);
+        if (agg_idx == 0) {
+            fprintf(stderr, "%s: ioctl(LL_IOC_LOCK_AHEAD) \'%s\'\n",
+                    __func__,
+                    strerror(saved_errno));
+        }
+        /* Note: it's too late to turn off 'request only' locking, which
+           could affect performance without also having 'lock ahead'.
+
+           We expect lustre to support this (turning it off) later */
+    }
+
+    return;
+}
+
diff --git a/src/mpi/romio/adio/ad_lustre/ad_lustre_open.c b/src/mpi/romio/adio/ad_lustre/ad_lustre_open.c index 9028aec..1509957 100644 --- a/src/mpi/romio/adio/ad_lustre/ad_lustre_open.c +++ b/src/mpi/romio/adio/ad_lustre/ad_lustre_open.c @@ -15,6 +15,10 @@ #define MAX_LOV_UUID_COUNT 1000 +/* Prototype from ad_lustre_lock.c */ +int ADIOI_LUSTRE_clear_locks(ADIO_File); +int ADIOI_LUSTRE_noexpand_locks(ADIO_File); + void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code) { int perm, old_mask, amode, amode_direct; @@ -159,6 +163,13 @@ void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code) } } + if(fd->hints->fs_hints.lustre.lock_ahead_read || + fd->hints->fs_hints.lustre.lock_ahead_write) { + ADIOI_LUSTRE_clear_locks(fd); + if(getenv("SET_REQ_ONLY")) + ADIOI_LUSTRE_noexpand_locks(fd); + } + fn_exit: ADIOI_Free(lum); ADIOI_Free(value); diff --git a/src/mpi/romio/adio/ad_lustre/ad_lustre_wrcoll.c b/src/mpi/romio/adio/ad_lustre/ad_lustre_wrcoll.c index cf6787c..014f8ba 100644 --- a/src/mpi/romio/adio/ad_lustre/ad_lustre_wrcoll.c +++ b/src/mpi/romio/adio/ad_lustre/ad_lustre_wrcoll.c @@ -11,6 +11,23 @@ #include "ad_lustre.h" #include "adio_extern.h" +/* in ad_lustre_lock.c */ +void ADIOI_LUSTRE_lock_ahead_ioctl(ADIO_File fd, + int avail_cb_nodes, + ADIO_Offset next_offset, + int *error_code); + +/* Handle lock ahead. If this write is outside our locked region, lock it now */ +#define ADIOI_LUSTRE_WR_LOCK_AHEAD(fd,cb_nodes,offset,error_code) \ +if(fd->hints->fs_hints.lustre.lock_ahead_write) { \ + if (offset > fd->hints->fs_hints.lustre.lock_ahead_end_extent) { \ + ADIOI_LUSTRE_lock_ahead_ioctl(fd, cb_nodes, offset, error_code); \ + } \ + else if (offset < fd->hints->fs_hints.lustre.lock_ahead_start_extent) { \ + ADIOI_LUSTRE_lock_ahead_ioctl(fd, cb_nodes, offset, error_code); \ + } \ +} + /* prototypes of functions used for collective writes only. 
*/ static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype datatype, int nprocs, @@ -637,6 +654,7 @@ static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, const void *buf, if (flag) { /* check whether to do data sieving */ if(data_sieving == ADIOI_HINT_ENABLE) { + ADIOI_LUSTRE_WR_LOCK_AHEAD(fd, striping_info[2], off, error_code); ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE, ADIO_EXPLICIT_OFFSET, off, &status, error_code); @@ -644,6 +662,7 @@ static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, const void *buf, /* if there is no hole, write data in one time; * otherwise, write data in several times */ if (!hole) { + ADIOI_LUSTRE_WR_LOCK_AHEAD(fd, striping_info[2], off, error_code); ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE, ADIO_EXPLICIT_OFFSET, off, &status, error_code); @@ -660,6 +679,7 @@ static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, const void *buf, if (srt_off[i] == block_offset + block_len) { block_len += srt_len[i]; } else { + ADIOI_LUSTRE_WR_LOCK_AHEAD(fd, striping_info[2], block_offset, error_code); ADIO_WriteContig(fd, write_buf + block_offset - off, block_len, @@ -675,6 +695,7 @@ static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, const void *buf, } } if (block_offset != -1) { + ADIOI_LUSTRE_WR_LOCK_AHEAD(fd, striping_info[2], block_offset, error_code); ADIO_WriteContig(fd, write_buf + block_offset - off, block_len, diff --git a/src/mpi/romio/adio/common/ad_read_coll.c b/src/mpi/romio/adio/common/ad_read_coll.c index 0335374..6bb9c93 100644 --- a/src/mpi/romio/adio/common/ad_read_coll.c +++ b/src/mpi/romio/adio/common/ad_read_coll.c @@ -51,6 +51,24 @@ void ADIOI_Fill_user_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node ADIO_Offset *fd_end, MPI_Aint buftype_extent); +#ifdef LUSTRE_RD_LOCK_AHEAD +/* There is no ad_lustre_rdcoll.c, so stub in some basic common code here + If it's called for over non-lustre file systems, it will turn itself off + when the ioctl fails. 
*/ +void ADIOI_LUSTRE_lock_ahead_ioctl(ADIO_File fd, int avail_cb_nodes, ADIO_Offset next_offset, int *error_code); /* ad_lustre_lock.c */ +/* Handle lock ahead. If this read is outside our locked region, lock it now */ +#define ADIOI_LUSTRE_RD_LOCK_AHEAD(fd,cb_nodes,offset,error_code) \ +if((fd->file_system == ADIO_LUSTRE) && (fd->hints->fs_hints.lustre.lock_ahead_read)) { \ + if (offset > fd->hints->fs_hints.lustre.lock_ahead_end_extent) { \ + ADIOI_LUSTRE_lock_ahead_ioctl(fd,cb_nodes,offset,error_code); \ + } \ + else if (offset < fd->hints->fs_hints.lustre.lock_ahead_start_extent) { \ + ADIOI_LUSTRE_lock_ahead_ioctl(fd,cb_nodes,offset,error_code); \ + } \ +} +#else +#define ADIOI_LUSTRE_RD_LOCK_AHEAD(fd,cb_nodes,offset,error_code) +#endif void ADIOI_GEN_ReadStridedColl(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, @@ -713,6 +731,7 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype if (flag) { ADIOI_Assert(size == (int)size); + ADIOI_LUSTRE_RD_LOCK_AHEAD(fd,fd->hints->cb_nodes, off,error_code); ADIO_ReadContig(fd, read_buf+for_curr_iter, (int)size, MPI_BYTE, ADIO_EXPLICIT_OFFSET, off, &status, error_code); if (*error_code != MPI_SUCCESS) return; diff --git a/src/mpi/romio/adio/include/adioi.h b/src/mpi/romio/adio/include/adioi.h index bea225d..a9426da 100644 --- a/src/mpi/romio/adio/include/adioi.h +++ b/src/mpi/romio/adio/include/adioi.h @@ -69,6 +69,13 @@ struct ADIOI_Hints_struct { int co_ratio; int coll_threshold; int ds_in_coll; + int lock_ahead_read; + int lock_ahead_write; + int lock_ahead_num_extents; + int lock_ahead_flags; + int lock_ahead_ladvise; + ADIO_Offset lock_ahead_start_extent; + ADIO_Offset lock_ahead_end_extent; } lustre; struct { unsigned read_chunk_sz; /* chunk size for direct reads */