From 8ea1081cbd71d05645253eaa65380cfeab79cd68 Mon Sep 17 00:00:00 2001 From: Niu Yawei Date: Wed, 21 May 2014 00:46:41 -0400 Subject: [PATCH] quota: remove dqptr_sem for scalability Remove dqptr_sem (but kept in struct quota_info to keep kernel ABI unchanged), and the functionality of this lock is implemented by other locks: * i_dquot is protected by i_lock, however only this pointer, the content of this struct is by dq_data_lock. * Q_GETFMT is now protected with dqonoff_mutex instead of dqptr_sem. * Small changes in __dquot_initialize() to avoid unnecessary dqget()/dqput() calls. Signed-off-by: Lai Siyao Signed-off-by: Niu Yawei --- fs/quota/dquot.c | 171 ++++++++++++++++++++++++++------------------------- fs/quota/quota.c | 6 +- fs/stat.c | 7 ++- fs/super.c | 1 - include/linux/fs.h | 1 + 5 files changed, 97 insertions(+), 89 deletions(-) diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 9cd5f63..99394b2 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -83,26 +83,21 @@ /* * There are three quota SMP locks. dq_list_lock protects all lists with quotas * and quota formats. - * dq_data_lock protects data from dq_dqb and also mem_dqinfo structures and - * also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes. - * i_blocks and i_bytes updates itself are guarded by i_lock acquired directly - * in inode_add_bytes() and inode_sub_bytes(). dq_state_lock protects - * modifications of quota state (on quotaon and quotaoff) and readers who care - * about latest values take it as well. - * - * The spinlock ordering is hence: dq_data_lock > dq_list_lock > i_lock, + * dq_data_lock protects data from dq_dqb and also mem_dqinfo structures. + * dq_state_lock protects modifications of quota state (on quotaon and quotaoff) + * and readers who care about latest values take it as well. + * + * The spinlock ordering is hence: i_lock > dq_data_lock > dq_list_lock, * dq_list_lock > dq_state_lock * * Note that some things (eg. sb pointer, type, id) doesn't change during * the life of the dquot structure and so needn't to be protected by a lock * - * Any operation working on dquots via inode pointers must hold dqptr_sem. If - * operation is just reading pointers from inode (or not using them at all) the - * read lock is enough. If pointers are altered function must hold write lock. + * Any operation working on dquots via inode pointers must hold i_lock. * Special care needs to be taken about S_NOQUOTA inode flag (marking that * inode is a quota file). Functions adding pointers from inode to dquots have - * to check this flag under dqptr_sem and then (if S_NOQUOTA is not set) they - * have to do all pointer modifications before dropping dqptr_sem. This makes + * to check this flag under i_lock and then (if S_NOQUOTA is not set) they + * have to do all pointer modifications before dropping i_lock. This makes * sure they cannot race with quotaon which first sets S_NOQUOTA flag and * then drops all pointers to dquots from an inode. * @@ -116,16 +111,9 @@ * spinlock to internal buffers before writing. * * Lock ordering (including related VFS locks) is the following: - * dqonoff_mutex > i_mutex > journal_lock > dqptr_sem > dquot->dq_lock > - * dqio_mutex + * i_mutex > dqonoff_sem > journal_lock > dquot->dq_lock > dqio_mutex * dqonoff_mutex > i_mutex comes from dquot_quota_sync, dquot_enable, etc. - * The lock ordering of dqptr_sem imposed by quota code is only dqonoff_sem > - * dqptr_sem. But filesystem has to count with the fact that functions such as - * dquot_alloc_space() acquire dqptr_sem and they usually have to be called - * from inside a transaction to keep filesystem consistency after a crash. Also - * filesystems usually want to do some IO on dquot from ->mark_dirty which is - * called with dqptr_sem held. - */ + */ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_list_lock); static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock); @@ -974,7 +962,6 @@ static inline int dqput_blocks(struct dquot *dquot) /* * Remove references to dquots from inode and add dquot to list for freeing * if we have the last reference to dquot - * We can't race with anybody because we hold dqptr_sem for writing... */ static int remove_inode_dquot_ref(struct inode *inode, int type, struct list_head *tofree_head) @@ -1035,13 +1022,15 @@ static void remove_dquot_ref(struct super_block *sb, int type, * We have to scan also I_NEW inodes because they can already * have quota pointer initialized. Luckily, we need to touch * only quota pointers and these have separate locking - * (dqptr_sem). + * (i_lock). */ + spin_lock(&inode->i_lock); if (!IS_NOQUOTA(inode)) { if (unlikely(inode_get_rsv_space(inode) > 0)) reserved = 1; remove_inode_dquot_ref(inode, type, tofree_head); } + spin_unlock(&inode->i_lock); } spin_unlock(&inode_sb_list_lock); #ifdef CONFIG_QUOTA_DEBUG @@ -1059,9 +1048,7 @@ static void drop_dquot_ref(struct super_block *sb, int type) LIST_HEAD(tofree_head); if (sb->dq_op) { - down_write(&sb_dqopt(sb)->dqptr_sem); remove_dquot_ref(sb, type, &tofree_head); - up_write(&sb_dqopt(sb)->dqptr_sem); put_dquot_list(&tofree_head); } } @@ -1392,25 +1379,27 @@ static int dquot_active(const struct inode *inode) /* * Initialize quota pointers in inode * - * We do things in a bit complicated way but by that we avoid calling - * dqget() and thus filesystem callbacks under dqptr_sem. - * * It is better to call this function outside of any transaction as it * might need a lot of space in journal for dquot structure allocation. */ static void __dquot_initialize(struct inode *inode, int type) { - int cnt; - struct dquot *got[MAXQUOTAS]; + int cnt, dq_get = 0; + struct dquot *got[MAXQUOTAS] = { NULL, NULL }; struct super_block *sb = inode->i_sb; qsize_t rsv; - /* First test before acquiring mutex - solves deadlocks when we - * re-enter the quota code and are already holding the mutex */ if (!dquot_active(inode)) return; - /* First get references to structures we might need. */ + /* In most case, the i_dquot should have been initialized, except + * the newly allocated one. We'd always try to skip the dqget() and + * dqput() calls to avoid unnecessary global lock contention. */ + if (!(inode->i_state & I_NEW)) + goto init_idquot; + +get_dquots: + dq_get = 1; for (cnt = 0; cnt < MAXQUOTAS; cnt++) { struct kqid qid; got[cnt] = NULL; @@ -1426,8 +1415,8 @@ static void __dquot_initialize(struct inode *inode, int type) } got[cnt] = dqget(sb, qid); } - - down_write(&sb_dqopt(sb)->dqptr_sem); +init_idquot: + spin_lock(&inode->i_lock); if (IS_NOQUOTA(inode)) goto out_err; for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -1437,9 +1426,13 @@ static void __dquot_initialize(struct inode *inode, int type) if (!sb_has_quota_active(sb, cnt)) continue; /* We could race with quotaon or dqget() could have failed */ - if (!got[cnt]) + if (!got[cnt] && dq_get) continue; if (!inode->i_dquot[cnt]) { + if (dq_get == 0) { + spin_unlock(&inode->i_lock); + goto get_dquots; + } inode->i_dquot[cnt] = got[cnt]; got[cnt] = NULL; /* @@ -1455,7 +1448,7 @@ static void __dquot_initialize(struct inode *inode, int type) } } out_err: - up_write(&sb_dqopt(sb)->dqptr_sem); + spin_unlock(&inode->i_lock); /* Drop unused references */ dqput_all(got); } @@ -1474,12 +1467,12 @@ static void __dquot_drop(struct inode *inode) int cnt; struct dquot *put[MAXQUOTAS]; - down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + spin_lock(&inode->i_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { put[cnt] = inode->i_dquot[cnt]; inode->i_dquot[cnt] = NULL; } - up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + spin_unlock(&inode->i_lock); dqput_all(put); } @@ -1519,36 +1512,57 @@ static qsize_t *inode_reserved_space(struct inode * inode) return inode->i_sb->dq_op->get_reserved_space(inode); } +static inline void __inode_add_rsv_space(struct inode *inode, qsize_t number) +{ + *inode_reserved_space(inode) += number; +} + void inode_add_rsv_space(struct inode *inode, qsize_t number) { spin_lock(&inode->i_lock); - *inode_reserved_space(inode) += number; + __inode_add_rsv_space(inode, number); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(inode_add_rsv_space); +static inline void __inode_claim_rsv_space(struct inode *inode, qsize_t number) +{ + *inode_reserved_space(inode) -= number; + __inode_add_bytes(inode, number); +} + void inode_claim_rsv_space(struct inode *inode, qsize_t number) { spin_lock(&inode->i_lock); - *inode_reserved_space(inode) -= number; - __inode_add_bytes(inode, number); + __inode_claim_rsv_space(inode, number); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(inode_claim_rsv_space); -void inode_reclaim_rsv_space(struct inode *inode, qsize_t number) +static inline void __inode_reclaim_rsv_space(struct inode *inode, + qsize_t number) { - spin_lock(&inode->i_lock); *inode_reserved_space(inode) += number; __inode_sub_bytes(inode, number); +} + +void inode_reclaim_rsv_space(struct inode *inode, qsize_t number) +{ + spin_lock(&inode->i_lock); + __inode_reclaim_rsv_space(inode, number); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(inode_reclaim_rsv_space); +static inline void __inode_sub_rsv_space(struct inode *inode, qsize_t number) +{ + *inode_reserved_space(inode) -= number; +} + void inode_sub_rsv_space(struct inode *inode, qsize_t number) { spin_lock(&inode->i_lock); - *inode_reserved_space(inode) -= number; + __inode_sub_rsv_space(inode, number); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(inode_sub_rsv_space); @@ -1559,9 +1573,8 @@ static qsize_t inode_get_rsv_space(struct inode *inode) if (!inode->i_sb->dq_op->get_reserved_space) return 0; - spin_lock(&inode->i_lock); + ret = *inode_reserved_space(inode); - spin_unlock(&inode->i_lock); return ret; } @@ -1569,17 +1582,17 @@ static void inode_incr_space(struct inode *inode, qsize_t number, int reserve) { if (reserve) - inode_add_rsv_space(inode, number); + __inode_add_rsv_space(inode, number); else - inode_add_bytes(inode, number); + __inode_add_bytes(inode, number); } static void inode_decr_space(struct inode *inode, qsize_t number, int reserve) { if (reserve) - inode_sub_rsv_space(inode, number); + __inode_sub_rsv_space(inode, number); else - inode_sub_bytes(inode, number); + __inode_sub_bytes(inode, number); } /* @@ -1602,10 +1615,6 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) struct dquot **dquots = inode->i_dquot; int reserve = flags & DQUOT_SPACE_RESERVE; - /* - * First test before acquiring mutex - solves deadlocks when we - * re-enter the quota code and are already holding the mutex - */ if (!dquot_active(inode)) { inode_incr_space(inode, number, reserve); goto out; @@ -1614,7 +1623,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) for (cnt = 0; cnt < MAXQUOTAS; cnt++) warn[cnt].w_type = QUOTA_NL_NOWARN; - down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + spin_lock(&inode->i_lock); spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!dquots[cnt]) @@ -1623,6 +1632,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) !(flags & DQUOT_SPACE_WARN), &warn[cnt]); if (ret && !(flags & DQUOT_SPACE_NOFAIL)) { spin_unlock(&dq_data_lock); + spin_unlock(&inode->i_lock); goto out_flush_warn; } } @@ -1636,12 +1646,12 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) } inode_incr_space(inode, number, reserve); spin_unlock(&dq_data_lock); + spin_unlock(&inode->i_lock); if (reserve) goto out_flush_warn; mark_all_dquot_dirty(dquots); out_flush_warn: - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); flush_warnings(warn); out: return ret; @@ -1657,13 +1667,12 @@ int dquot_alloc_inode(const struct inode *inode) struct dquot_warn warn[MAXQUOTAS]; struct dquot * const *dquots = inode->i_dquot; - /* First test before acquiring mutex - solves deadlocks when we - * re-enter the quota code and are already holding the mutex */ if (!dquot_active(inode)) return 0; for (cnt = 0; cnt < MAXQUOTAS; cnt++) warn[cnt].w_type = QUOTA_NL_NOWARN; - down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + + spin_lock((spinlock_t *)&inode->i_lock); spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!dquots[cnt]) @@ -1681,9 +1690,9 @@ int dquot_alloc_inode(const struct inode *inode) warn_put_all: spin_unlock(&dq_data_lock); + spin_unlock((spinlock_t *)&inode->i_lock); if (ret == 0) mark_all_dquot_dirty(dquots); - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); flush_warnings(warn); return ret; } @@ -1701,7 +1710,7 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) return 0; } - down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + spin_lock(&inode->i_lock); spin_lock(&dq_data_lock); /* Claim reserved quotas to allocated quotas */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -1710,10 +1719,10 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) number); } /* Update inode bytes */ - inode_claim_rsv_space(inode, number); + __inode_claim_rsv_space(inode, number); spin_unlock(&dq_data_lock); + spin_unlock(&inode->i_lock); mark_all_dquot_dirty(inode->i_dquot); - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); return 0; } EXPORT_SYMBOL(dquot_claim_space_nodirty); @@ -1730,7 +1739,7 @@ void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number) return; } - down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + spin_lock(&inode->i_lock); spin_lock(&dq_data_lock); /* Claim reserved quotas to allocated quotas */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -1739,10 +1748,10 @@ void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number) number); } /* Update inode bytes */ - inode_reclaim_rsv_space(inode, number); + __inode_reclaim_rsv_space(inode, number); spin_unlock(&dq_data_lock); + spin_unlock(&inode->i_lock); mark_all_dquot_dirty(inode->i_dquot); - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); return; } EXPORT_SYMBOL(dquot_reclaim_space_nodirty); @@ -1757,14 +1766,12 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags) struct dquot **dquots = inode->i_dquot; int reserve = flags & DQUOT_SPACE_RESERVE; - /* First test before acquiring mutex - solves deadlocks when we - * re-enter the quota code and are already holding the mutex */ if (!dquot_active(inode)) { inode_decr_space(inode, number, reserve); return; } - down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + spin_lock(&inode->i_lock); spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { int wtype; @@ -1782,12 +1789,12 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags) } inode_decr_space(inode, number, reserve); spin_unlock(&dq_data_lock); + spin_unlock(&inode->i_lock); if (reserve) - goto out_unlock; + goto out; mark_all_dquot_dirty(dquots); -out_unlock: - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); +out: flush_warnings(warn); } EXPORT_SYMBOL(__dquot_free_space); @@ -1801,12 +1808,10 @@ void dquot_free_inode(const struct inode *inode) struct dquot_warn warn[MAXQUOTAS]; struct dquot * const *dquots = inode->i_dquot; - /* First test before acquiring mutex - solves deadlocks when we - * re-enter the quota code and are already holding the mutex */ if (!dquot_active(inode)) return; - down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + spin_lock((spinlock_t *)&inode->i_lock); spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { int wtype; @@ -1820,8 +1825,8 @@ void dquot_free_inode(const struct inode *inode) dquot_decr_inodes(dquots[cnt], 1); } spin_unlock(&dq_data_lock); + spin_unlock((spinlock_t *)&inode->i_lock); mark_all_dquot_dirty(dquots); - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); flush_warnings(warn); } EXPORT_SYMBOL(dquot_free_inode); @@ -1847,8 +1852,6 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) struct dquot_warn warn_from_inodes[MAXQUOTAS]; struct dquot_warn warn_from_space[MAXQUOTAS]; - /* First test before acquiring mutex - solves deadlocks when we - * re-enter the quota code and are already holding the mutex */ if (IS_NOQUOTA(inode)) return 0; /* Initialize the arrays */ @@ -1857,9 +1860,9 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) warn_from_inodes[cnt].w_type = QUOTA_NL_NOWARN; warn_from_space[cnt].w_type = QUOTA_NL_NOWARN; } - down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + spin_lock(&inode->i_lock); if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ - up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + spin_unlock(&inode->i_lock); return 0; } spin_lock(&dq_data_lock); @@ -1916,7 +1919,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) inode->i_dquot[cnt] = transfer_to[cnt]; } spin_unlock(&dq_data_lock); - up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + spin_unlock(&inode->i_lock); mark_all_dquot_dirty(transfer_from); mark_all_dquot_dirty(transfer_to); @@ -1930,7 +1933,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) return 0; over_quota: spin_unlock(&dq_data_lock); - up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + spin_unlock(&inode->i_lock); flush_warnings(warn_to); return ret; } diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 2b363e2..e4851cb 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -79,13 +79,13 @@ static int quota_getfmt(struct super_block *sb, int type, void __user *addr) { __u32 fmt; - down_read(&sb_dqopt(sb)->dqptr_sem); + mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); if (!sb_has_quota_active(sb, type)) { - up_read(&sb_dqopt(sb)->dqptr_sem); + mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); return -ESRCH; } fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id; - up_read(&sb_dqopt(sb)->dqptr_sem); + mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); if (copy_to_user(addr, &fmt, sizeof(fmt))) return -EFAULT; return 0; diff --git a/fs/stat.c b/fs/stat.c index ae0c3ce..b0e6898 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -488,12 +488,17 @@ void inode_sub_bytes(struct inode *inode, loff_t bytes) EXPORT_SYMBOL(inode_sub_bytes); +loff_t __inode_get_bytes(struct inode *inode) +{ + return (((loff_t)inode->i_blocks) << 9) + inode->i_bytes; +} + loff_t inode_get_bytes(struct inode *inode) { loff_t ret; spin_lock(&inode->i_lock); - ret = (((loff_t)inode->i_blocks) << 9) + inode->i_bytes; + ret = __inode_get_bytes(inode); spin_unlock(&inode->i_lock); return ret; } diff --git a/fs/super.c b/fs/super.c index 48377f7..a97aecf 100644 --- a/fs/super.c +++ b/fs/super.c @@ -214,7 +214,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key); mutex_init(&s->s_dquot.dqio_mutex); mutex_init(&s->s_dquot.dqonoff_mutex); - init_rwsem(&s->s_dquot.dqptr_sem); s->s_maxbytes = MAX_NON_LFS; s->s_op = &default_op; s->s_time_gran = 1000000000; diff --git a/include/linux/fs.h b/include/linux/fs.h index 8780312..cd2f427 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2518,6 +2518,7 @@ void __inode_add_bytes(struct inode *inode, loff_t bytes); void inode_add_bytes(struct inode *inode, loff_t bytes); void __inode_sub_bytes(struct inode *inode, loff_t bytes); void inode_sub_bytes(struct inode *inode, loff_t bytes); +loff_t __inode_get_bytes(struct inode *inode); loff_t inode_get_bytes(struct inode *inode); void inode_set_bytes(struct inode *inode, loff_t bytes); -- 1.7.1