diff --git a/config/lustre-build-zfs.m4 b/config/lustre-build-zfs.m4
index b6d56dd..dd14cc6 100644
--- a/config/lustre-build-zfs.m4
+++ b/config/lustre-build-zfs.m4
@@ -401,6 +401,31 @@ your distribution.
 			AC_DEFINE(HAVE_DSL_SYNC_TASK_DO_NOWAIT, 1,
 				[Have dsl_sync_task_do_nowait in ZFS])
 		])
+		LB_CHECK_COMPILE([if zfs defines sa_spill_alloc],
+		sa_spill_alloc, [
+			#include <sys/kmem.h>
+			#include <sys/sa.h>
+		],[
+			void *ptr;
+
+			ptr = sa_spill_alloc(KM_SLEEP);
+			sa_spill_free(ptr);
+		],[
+			AC_DEFINE(HAVE_SA_SPILL_ALLOC, 1,
+				[Have sa_spill_alloc in ZFS])
+		])
+		LB_CHECK_COMPILE([if zfs defines spa_maxblocksize],
+		spa_maxblocksize, [
+			#include <sys/spa.h>
+		],[
+			spa_t *spa = NULL;
+			int size;
+
+			size = spa_maxblocksize(spa);
+		],[
+			AC_DEFINE(HAVE_SPA_MAXBLOCKSIZE, 1,
+				[Have spa_maxblocksize in ZFS])
+		])
 	])
 
 	AM_CONDITIONAL(ZFS_ENABLED, [test "x$enable_zfs" = xyes])
diff --git a/lustre/osd-zfs/osd_handler.c b/lustre/osd-zfs/osd_handler.c
index 62965a7..9674f72 100644
--- a/lustre/osd-zfs/osd_handler.c
+++ b/lustre/osd-zfs/osd_handler.c
@@ -307,7 +307,7 @@ static struct thandle *osd_trans_create(const struct lu_env *env,
 
 /* Estimate the number of objects from a number of blocks */
 uint64_t osd_objs_count_estimate(uint64_t refdbytes, uint64_t usedobjs,
-				 uint64_t nrblocks)
+				 uint64_t nrblocks, uint64_t est_maxblockshift)
 {
 	uint64_t est_objs, est_refdblocks, est_usedobjs;
 
@@ -326,7 +326,7 @@ uint64_t osd_objs_count_estimate(uint64_t refdbytes, uint64_t usedobjs,
 	CLASSERT(OSD_DNODE_MIN_BLKSHIFT > 0);
 	CLASSERT(OSD_DNODE_EST_BLKSHIFT > 0);
 
-	est_refdblocks = (refdbytes >> SPA_MAXBLOCKSHIFT) +
+	est_refdblocks = (refdbytes >> est_maxblockshift) +
 			 (OSD_DNODE_EST_COUNT >> OSD_DNODE_EST_BLKSHIFT);
 	est_usedobjs = usedobjs + OSD_DNODE_EST_COUNT;
 
@@ -371,14 +371,15 @@ uint64_t osd_objs_count_estimate(uint64_t refdbytes, uint64_t usedobjs,
 	return est_objs;
 }
 
-static int osd_objset_statfs(struct objset *os, struct obd_statfs *osfs)
+static int osd_objset_statfs(struct osd_device *osd, struct obd_statfs *osfs)
 {
+	struct objset *os = osd->od_os;
 	uint64_t refdbytes, availbytes, usedobjs, availobjs;
 	uint64_t est_availobjs;
 	uint64_t reserved;
+	uint64_t bshift;
 
-	dmu_objset_space(os, &refdbytes, &availbytes, &usedobjs,
-			 &availobjs);
+	dmu_objset_space(os, &refdbytes, &availbytes, &usedobjs, &availobjs);
 
 	/*
 	 * ZFS allows multiple block sizes. For statfs, Linux makes no
@@ -388,10 +389,11 @@ static int osd_objset_statfs(struct objset *os, struct obd_statfs *osfs)
 	 * largest possible block size as IO size for the optimum performance
 	 * and scale the free and used blocks count appropriately.
 	 */
-	osfs->os_bsize = 1ULL << SPA_MAXBLOCKSHIFT;
+	osfs->os_bsize = osd->od_max_blksz;
+	bshift = fls64(osfs->os_bsize) - 1;
 
-	osfs->os_blocks = (refdbytes + availbytes) >> SPA_MAXBLOCKSHIFT;
-	osfs->os_bfree = availbytes >> SPA_MAXBLOCKSHIFT;
+	osfs->os_blocks = (refdbytes + availbytes) >> bshift;
+	osfs->os_bfree = availbytes >> bshift;
 	osfs->os_bavail = osfs->os_bfree; /* no extra root reservation */
 
 	/* Take replication (i.e. number of copies) into account */
@@ -406,12 +408,11 @@ static int osd_objset_statfs(struct objset *os, struct obd_statfs *osfs)
-	 * Reserve 0.78% of total space, at least 4MB for small filesystems,
+	 * Reserve 0.78% of total space, at least 16MB for small filesystems,
 	 * for internal files to be created/unlinked when space is tight.
 	 */
-	CLASSERT(OSD_STATFS_RESERVED_BLKS > 0);
-	if (likely(osfs->os_blocks >=
-		    OSD_STATFS_RESERVED_BLKS << OSD_STATFS_RESERVED_SHIFT))
+	CLASSERT(OSD_STATFS_RESERVED_SIZE > 0);
+	/* Convert the byte minimum to blocks before comparing block counts. */
+	reserved = OSD_STATFS_RESERVED_SIZE >> bshift;
+	if (likely(osfs->os_blocks >= reserved << OSD_STATFS_RESERVED_SHIFT))
 		reserved = osfs->os_blocks >> OSD_STATFS_RESERVED_SHIFT;
-	else
-		reserved = OSD_STATFS_RESERVED_BLKS;
 
 	osfs->os_blocks -= reserved;
 	osfs->os_bfree -= MIN(reserved, osfs->os_bfree);
@@ -425,7 +426,7 @@ static int osd_objset_statfs(struct objset *os, struct obd_statfs *osfs)
 	 * Compute a better estimate in udmu_objs_count_estimate().
 	 */
 	est_availobjs = osd_objs_count_estimate(refdbytes, usedobjs,
-						osfs->os_bfree);
+						osfs->os_bfree, bshift);
 
 	osfs->os_ffree = min(availobjs, est_availobjs);
 	osfs->os_files = osfs->os_ffree + usedobjs;
@@ -454,11 +455,10 @@ static int osd_objset_statfs(struct objset *os, struct obd_statfs *osfs)
 int osd_statfs(const struct lu_env *env, struct dt_device *d,
 	       struct obd_statfs *osfs)
 {
-	struct osd_device *osd = osd_dt_dev(d);
 	int rc;
 	ENTRY;
 
-	rc = osd_objset_statfs(osd->od_os, osfs);
+	rc = osd_objset_statfs(osd_dt_dev(d), osfs);
 	if (unlikely(rc != 0))
 		RETURN(rc);
 
@@ -468,13 +468,14 @@ int osd_statfs(const struct lu_env *env, struct dt_device *d,
 	RETURN(0);
 }
 
-static int osd_blk_insert_cost(void)
+static int osd_blk_insert_cost(struct osd_device *osd)
 {
-	int max_blockshift, nr_blkptrshift;
+	int max_blockshift, nr_blkptrshift, bshift;
 
 	/* max_blockshift is the log2 of the number of blocks needed to reach
 	 * the maximum filesize (that's to say 2^64) */
-	max_blockshift = DN_MAX_OFFSET_SHIFT - SPA_MAXBLOCKSHIFT;
+	bshift = osd_spa_maxblockshift(dmu_objset_spa(osd->od_os));
+	max_blockshift = DN_MAX_OFFSET_SHIFT - bshift;
 
 	/* nr_blkptrshift is the log2 of the number of block pointers that can
 	 * be stored in an indirect block */
@@ -526,7 +527,7 @@ static void osd_conf_get(const struct lu_env *env,
 	 * estimate the real size consumed by an object */
 	param->ddp_inodespace = OSD_DNODE_EST_COUNT;
 	/* per-fragment overhead to be used by the client code */
-	param->ddp_grant_frag = osd_blk_insert_cost();
+	param->ddp_grant_frag = osd_blk_insert_cost(osd);
 }
 
 /*
@@ -676,6 +677,70 @@ static void osd_xattr_changed_cb(void *arg, uint64_t newval)
 	osd->od_xattr_in_sa = (newval == ZFS_XATTR_SA);
 }
 
+static void osd_recordsize_changed_cb(void *arg, uint64_t newval)
+{
+	struct osd_device *osd = arg;
+
+	LASSERT(newval <= osd_spa_maxblocksize(dmu_objset_spa(osd->od_os)));
+	LASSERT(newval >= SPA_MINBLOCKSIZE);
+	LASSERT(ISP2(newval));
+
+	osd->od_max_blksz = newval;
+}
+
+/*
+ * This function unregisters all registered callbacks. It's harmless to
+ * unregister callbacks that were never registered so it is used to safely
+ * unwind a partially completed call to osd_objset_register_callbacks().
+ */
+static void osd_objset_unregister_callbacks(struct osd_device *o)
+{
+	struct dsl_dataset *ds = dmu_objset_ds(o->od_os);
+
+	(void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_XATTR),
+				   osd_xattr_changed_cb, o);
+	(void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
+				   osd_recordsize_changed_cb, o);
+
+	if (o->arc_prune_cb != NULL) {
+		arc_remove_prune_callback(o->arc_prune_cb);
+		o->arc_prune_cb = NULL;
+	}
+}
+
+/*
+ * Register the required callbacks to be notified when zfs properties
+ * are modified using the 'zfs(8)' command line utility.
+ */
+static int osd_objset_register_callbacks(struct osd_device *o)
+{
+	struct dsl_dataset *ds = dmu_objset_ds(o->od_os);
+	dsl_pool_t *dp = dmu_objset_pool(o->od_os);
+	int rc;
+
+	LASSERT(ds);
+	LASSERT(dp);
+
+	dsl_pool_config_enter(dp, FTAG);
+	rc = -dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_XATTR),
+				osd_xattr_changed_cb, o);
+	if (rc)
+		GOTO(err, rc);
+
+	rc = -dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
+				osd_recordsize_changed_cb, o);
+	if (rc)
+		GOTO(err, rc);
+
+	o->arc_prune_cb = arc_add_prune_callback(arc_prune_func, o);
+err:
+	dsl_pool_config_exit(dp, FTAG);
+	if (rc)
+		osd_objset_unregister_callbacks(o);
+
+	RETURN(rc);
+}
+
 static int osd_objset_open(struct osd_device *o)
 {
 	uint64_t version = ZPL_VERSION;
@@ -738,11 +803,9 @@ out:
 static int osd_mount(const struct lu_env *env,
 		     struct osd_device *o, struct lustre_cfg *cfg)
 {
-	struct dsl_dataset *ds;
 	char *mntdev = lustre_cfg_string(cfg, 1);
 	char *svname = lustre_cfg_string(cfg, 4);
 	dmu_buf_t *rootdb;
-	dsl_pool_t *dp;
 	const char *opts;
 	int rc;
 	ENTRY;
@@ -765,30 +828,19 @@ static int osd_mount(const struct lu_env *env,
 		o->od_is_ost = 1;
 
 	rc = osd_objset_open(o);
-	if (rc) {
-		CERROR("%s: can't open objset %s: rc = %d\n", o->od_svname,
-			o->od_mntdev, rc);
-		RETURN(rc);
-	}
+	if (rc)
+		GOTO(err, rc);
 
-	ds = dmu_objset_ds(o->od_os);
-	dp = dmu_objset_pool(o->od_os);
-	LASSERT(ds);
-	LASSERT(dp);
-	dsl_pool_config_enter(dp, FTAG);
-	rc = dsl_prop_register(ds, "xattr", osd_xattr_changed_cb, o);
-	dsl_pool_config_exit(dp, FTAG);
+	o->od_xattr_in_sa = B_TRUE;
+	o->od_max_blksz = SPA_OLD_MAXBLOCKSIZE;
+
+	rc = osd_objset_register_callbacks(o);
 	if (rc)
-		CWARN("%s: can't register xattr callback, ignore: rc=%d\n",
-		      o->od_svname, rc);
+		GOTO(err, rc);
 
 	rc = __osd_obj2dbuf(env, o->od_os, o->od_rootid, &rootdb);
-	if (rc) {
-		CERROR("%s: obj2dbuf() failed: rc = %d\n", o->od_svname, rc);
-		dmu_objset_disown(o->od_os, o);
-		o->od_os = NULL;
-		RETURN(rc);
-	}
+	if (rc)
+		GOTO(err, rc);
 
 	o->od_root = rootdb->db_object;
 	sa_buf_rele(rootdb, osd_obj_tag);
@@ -819,8 +871,6 @@ static int osd_mount(const struct lu_env *env,
 	if (rc)
 		GOTO(err, rc);
 
-	o->arc_prune_cb = arc_add_prune_callback(arc_prune_func, o);
-
 	/* initialize quota slave instance */
 	o->od_quota_slave = qsd_init(env, o->od_svname, &o->od_dt_dev,
 				     o->od_proc_entry);
@@ -836,6 +886,11 @@ static int osd_mount(const struct lu_env *env,
 		o->od_posix_acl = 1;
 
 err:
+	if (rc && o->od_os != NULL) { /* od_os may be unset if open failed */
+		dmu_objset_disown(o->od_os, o);
+		o->od_os = NULL;
+	}
+
 	RETURN(rc);
 }
 
@@ -946,7 +1001,6 @@ static struct lu_device *osd_device_fini(const struct lu_env *env,
 					 struct lu_device *d)
 {
 	struct osd_device *o = osd_dev(d);
-	struct dsl_dataset *ds;
 	int rc;
 	ENTRY;
 
@@ -955,15 +1009,7 @@ static struct lu_device *osd_device_fini(const struct lu_env *env,
 	osd_oi_fini(env, o);
 
 	if (o->od_os) {
-		ds = dmu_objset_ds(o->od_os);
-		rc = dsl_prop_unregister(ds, "xattr", osd_xattr_changed_cb, o);
-		if (rc)
-			CERROR("%s: dsl_prop_unregister xattr error %d\n",
-				o->od_svname, rc);
-		if (o->arc_prune_cb != NULL) {
-			arc_remove_prune_callback(o->arc_prune_cb);
-			o->arc_prune_cb = NULL;
-		}
+		osd_objset_unregister_callbacks(o);
 		osd_sync(env, lu2dt_dev(d));
 		txg_wait_callbacks(spa_get_dsl(dmu_objset_spa(o->od_os)));
 	}
diff --git a/lustre/osd-zfs/osd_internal.h b/lustre/osd-zfs/osd_internal.h
index 3888c5b..b74aa1d 100644
--- a/lustre/osd-zfs/osd_internal.h
+++ b/lustre/osd-zfs/osd_internal.h
@@ -47,23 +47,35 @@
 #include
 #include
 #include
-
-#define _SPL_KMEM_H
-/* SPL redefines this but to the same value: ~0UL vs -1, but GCC complains.
- * fixed in SPL master 52479ecf58fa89190e384edcf838fecccc786af5 */
+#ifdef SHRINK_STOP
 #undef SHRINK_STOP
-#include
-#define kmem_zalloc(a, b) kzalloc(a, b)
-#define kmem_free(ptr, sz) ((void)(sz), kfree(ptr))
-#ifndef KM_SLEEP
-#define KM_SLEEP GFP_KERNEL
 #endif
-
 #include
 #include
 #include
 #include
 
+/**
+ * By design including kmem.h overrides the Linux slab interfaces to provide
+ * the Illumos kmem cache interfaces. To override this and gain access to
+ * the Linux interfaces these preprocessor macros must be undefined.
+ */
+#ifdef kmem_cache_destroy
+#undef kmem_cache_destroy
+#endif
+
+#ifdef kmem_cache_create
+#undef kmem_cache_create
+#endif
+
+#ifdef kmem_cache_alloc
+#undef kmem_cache_alloc
+#endif
+
+#ifdef kmem_cache_free
+#undef kmem_cache_free
+#endif
+
 #define LUSTRE_ROOT_FID_SEQ 0
 #define DMU_OSD_SVNAME "svname"
 #define DMU_OSD_OI_NAME_BASE "oi"
@@ -71,13 +83,13 @@
 #define OSD_GFP_IO (GFP_NOFS | __GFP_HIGHMEM)
 
 /* Statfs space reservation for grant, fragmentation, and unlink space. */
-#define OSD_STATFS_RESERVED_BLKS (1ULL << (22 - SPA_MAXBLOCKSHIFT)) /* 4MB */
-#define OSD_STATFS_RESERVED_SHIFT (7) /* reserve 0.78% of all space */
+#define OSD_STATFS_RESERVED_SIZE (16ULL << 20) /* reserve 16MB minimum */
+#define OSD_STATFS_RESERVED_SHIFT (7) /* reserve 0.78% of all space */
 
 /* Statfs {minimum, safe estimate, and maximum} dnodes per block */
-#define OSD_DNODE_MIN_BLKSHIFT (SPA_MAXBLOCKSHIFT - DNODE_SHIFT) /* 17-9 =8 */
-#define OSD_DNODE_EST_BLKSHIFT (SPA_MAXBLOCKSHIFT - 12) /* 17-12=5 */
-#define OSD_DNODE_EST_COUNT 1024
+#define OSD_DNODE_MIN_BLKSHIFT (DNODES_PER_BLOCK_SHIFT)
+#define OSD_DNODE_EST_BLKSHIFT (DNODES_PER_BLOCK_SHIFT >> 1)
+#define OSD_DNODE_EST_COUNT 1024
 
 #define OSD_GRANT_FOR_LOCAL_OIDS (2ULL << 20) /* 2MB for last_rcvd, ... */
 
@@ -262,6 +274,7 @@ struct osd_device {
 	struct proc_dir_entry *od_proc_entry;
 	struct lprocfs_stats *od_stats;
 
+	uint64_t od_max_blksz;
 	uint64_t od_root;
 	uint64_t od_O_id;
 	struct osd_oi **od_oi_table;
@@ -344,7 +357,7 @@ int osd_declare_quota(const struct lu_env *env, struct osd_device *osd,
 		      struct osd_thandle *oh, bool is_blk, int *flags,
 		      bool force);
 uint64_t osd_objs_count_estimate(uint64_t refdbytes, uint64_t usedobjs,
-				 uint64_t nrblocks);
+				 uint64_t nrblocks, uint64_t est_maxblockshift);
 
 /*
  * Helpers.
@@ -546,7 +559,32 @@ static inline void dsl_pool_config_enter(dsl_pool_t *dp, char *name)
 static inline void dsl_pool_config_exit(dsl_pool_t *dp, char *name)
 {
 }
+#endif
+
+#ifdef HAVE_SPA_MAXBLOCKSIZE
+#define osd_spa_maxblocksize(spa) spa_maxblocksize(spa)
+#define osd_spa_maxblockshift(spa) fls64(spa_maxblocksize(spa) - 1)
+#else
+#define osd_spa_maxblocksize(spa) SPA_MAXBLOCKSIZE
+#define osd_spa_maxblockshift(spa) SPA_MAXBLOCKSHIFT
+#define SPA_OLD_MAXBLOCKSIZE SPA_MAXBLOCKSIZE
+#endif
+
+#ifdef HAVE_SA_SPILL_ALLOC
+static inline void *
+osd_zio_buf_alloc(size_t size)
+{
+	return sa_spill_alloc(KM_SLEEP);
+}
 
+static inline void
+osd_zio_buf_free(void *buf, size_t size)
+{
+	sa_spill_free(buf);
+}
+#else
+#define osd_zio_buf_alloc(size) zio_buf_alloc(size)
+#define osd_zio_buf_free(buf, size) zio_buf_free(buf, size)
 #endif
 
 #endif /* _OSD_INTERNAL_H */
diff --git a/lustre/osd-zfs/osd_io.c b/lustre/osd-zfs/osd_io.c
index 47058d2..c771efa 100644
--- a/lustre/osd-zfs/osd_io.c
+++ b/lustre/osd-zfs/osd_io.c
@@ -533,7 +533,8 @@ static int osd_count_not_mapped(struct osd_object *obj, uint64_t start,
 		if (start < dn->dn_datablksz)
 			start = dn->dn_datablksz;
 		/* assume largest block size */
-		blkshift = SPA_MAXBLOCKSHIFT;
+		blkshift = osd_spa_maxblockshift(
+			dmu_objset_spa(osd_obj2dev(obj)->od_os));
 	} else {
 		/* blocksize can't change */
 		blkshift = dn->dn_datablkshift;
diff --git a/lustre/osd-zfs/osd_object.c b/lustre/osd-zfs/osd_object.c
index 1767769..bae5cbc 100644
--- a/lustre/osd-zfs/osd_object.c
+++ b/lustre/osd-zfs/osd_object.c
@@ -759,7 +759,8 @@ static int osd_attr_get(const struct lu_env *env,
 	attr->la_size = 512 * blocks;
 	/* Block size may be not set; suggest maximal I/O transfers. */
 	if (blksize == 0)
-		blksize = 1ULL << SPA_MAXBLOCKSHIFT;
+		blksize = osd_spa_maxblocksize(
+			dmu_objset_spa(osd_obj2dev(obj)->od_os));
 
 	attr->la_blksize = blksize;
 	attr->la_blocks = blocks;
@@ -1321,13 +1322,15 @@ static dmu_buf_t* osd_mkreg(const struct lu_env *env, struct osd_device *osd,
 		return ERR_PTR(rc);
 
 	/*
-	 * XXX: a hack, OST to use bigger blocksize. we need
-	 * a method in OSD API to control this from OFD/MDD
+	 * XXX: This heuristic is non-optimal. It would be better to
+	 * increase the blocksize up to osd->od_max_blksz during the write.
+	 * This is exactly how the ZPL behaves and it ensures that the right
+	 * blocksize is selected based on the file size rather than
+	 * making broad assumptions based on the osd type.
 	 */
 	if (!lu_device_is_md(osd2lu_dev(osd))) {
-		rc = -dmu_object_set_blocksize(osd->od_os,
-					       db->db_object,
-				128 << 10, 0, oh->ot_tx);
+		rc = -dmu_object_set_blocksize(osd->od_os, db->db_object,
+					       osd->od_max_blksz, 0, oh->ot_tx);
 		if (unlikely(rc)) {
 			CERROR("%s: can't change blocksize: %d\n",
 			       osd->od_svname, rc);
diff --git a/lustre/osd-zfs/osd_quota.c b/lustre/osd-zfs/osd_quota.c
index 3247ddd..74a2bfd 100644
--- a/lustre/osd-zfs/osd_quota.c
+++ b/lustre/osd-zfs/osd_quota.c
@@ -49,15 +49,16 @@ uint64_t osd_quota_fid2dmu(const struct lu_fid *fid)
 static uint64_t osd_objset_user_iused(struct osd_device *osd, uint64_t uidbytes)
 {
 	uint64_t refdbytes, availbytes, usedobjs, availobjs;
-	uint64_t uidobjs;
+	uint64_t uidobjs, bshift;
 
 	/* get fresh statfs info */
 	dmu_objset_space(osd->od_os, &refdbytes, &availbytes,
 			 &usedobjs, &availobjs);
 
 	/* estimate the number of objects based on the disk usage */
+	bshift = fls64(osd->od_max_blksz) - 1;
 	uidobjs = osd_objs_count_estimate(refdbytes, usedobjs,
-					  uidbytes >> SPA_MAXBLOCKSHIFT);
+					  uidbytes >> bshift, bshift);
 	if (uidbytes > 0)
 		/* if we have at least 1 byte, we have at least one dnode ... */
 		uidobjs = max_t(uint64_t, uidobjs, 1);
diff --git a/lustre/osd-zfs/osd_xattr.c b/lustre/osd-zfs/osd_xattr.c
index 1697728..4a32f1a 100644
--- a/lustre/osd-zfs/osd_xattr.c
+++ b/lustre/osd-zfs/osd_xattr.c
@@ -89,7 +89,7 @@ int __osd_xattr_load(struct osd_device *osd, uint64_t dnode, nvlist_t **sa)
 		goto out_sa;
 	}
 
-	buf = sa_spill_alloc(KM_SLEEP);
+	buf = osd_zio_buf_alloc(size);
 	if (buf == NULL) {
 		rc = -ENOMEM;
 		goto out_sa;
@@ -97,7 +97,7 @@ int __osd_xattr_load(struct osd_device *osd, uint64_t dnode, nvlist_t **sa)
 	rc = -sa_lookup(sa_hdl, SA_ZPL_DXATTR(osd), buf, size);
 	if (rc == 0)
 		rc = -nvlist_unpack(buf, size, sa, KM_SLEEP);
-	sa_spill_free(buf);
+	osd_zio_buf_free(buf, size);
 out_sa:
 	sa_handle_destroy(sa_hdl);
 
@@ -375,7 +375,7 @@ __osd_sa_xattr_update(const struct lu_env *env, struct osd_object *obj,
 	if (rc)
 		return rc;
 
-	dxattr = sa_spill_alloc(KM_SLEEP);
+	dxattr = osd_zio_buf_alloc(sa_size);
 	if (dxattr == NULL)
 		RETURN(-ENOMEM);
 
@@ -386,7 +386,7 @@ __osd_sa_xattr_update(const struct lu_env *env, struct osd_object *obj,
 
 	rc = osd_object_sa_update(obj, SA_ZPL_DXATTR(osd), dxattr, sa_size, oh);
 out_free:
-	sa_spill_free(dxattr);
+	osd_zio_buf_free(dxattr, sa_size);
 	RETURN(rc);
 }
 