ext4 changes between the linux-2.6.18-238.12.1 and linux-2.6.18-274.3.1 trees.

acl.c:    ext4_set_acl() now updates inode->i_ctime when it changes i_mode.
ext4.h, extents.c, file.c, inode.c, super.c:
          Serialize unaligned O_DIRECT AIO writes.  i_aiodio_unwritten counts
          io_end structures flagged DIO_AIO_UNWRITTEN; it is incremented where
          the flag is set and decremented in ext4_end_aio_dio_nolock(), which
          wakes the hashed wait queue (aio_wq) when it reaches zero.
          ext4_file_write() takes i_aio_mutex and calls ext4_aiodio_wait()
          before issuing an unaligned AIO/DIO write.  ext4.h declares
          ext4_aiodio_wait() under the same name file.c defines it with.
ialloc.c: ext4_free_inode() clears the inode bitmap bit with ext4_clear_bit()
          between ext4_lock_group() and ext4_unlock_group().
super.c:  ext4_dquot_initialize()/ext4_dquot_drop() run dquot_initialize()/
          dquot_drop() inside a journal handle.

diff -up linux-2.6.18-238.12.1/fs/ext4/acl.c linux-2.6.18-274.3.1/fs/ext4/acl.c
--- linux-2.6.18-238.12.1/fs/ext4/acl.c	2011-06-09 18:00:08.084084781 +0800
+++ linux-2.6.18-274.3.1/fs/ext4/acl.c	2011-10-21 01:40:05.228945091 +0800
@@ -249,6 +249,7 @@ ext4_set_acl(handle_t *handle, struct in
 				return error;
 			else {
 				inode->i_mode = mode;
+				inode->i_ctime = ext4_current_time(inode);
 				ext4_mark_inode_dirty(handle, inode);
 				if (error == 0)
 					acl = NULL;
diff -up linux-2.6.18-238.12.1/fs/ext4/ext4.h linux-2.6.18-274.3.1/fs/ext4/ext4.h
--- linux-2.6.18-238.12.1/fs/ext4/ext4.h	2011-10-21 01:01:24.807691292 +0800
+++ linux-2.6.18-274.3.1/fs/ext4/ext4.h	2011-10-21 01:40:05.233945115 +0800
@@ -707,6 +707,8 @@ struct ext4_inode_info {
 	spinlock_t i_completed_io_lock;
 	/* current io_end structure for async DIO write*/
 	ext4_io_end_t *cur_aio_dio;
+	atomic_t i_aiodio_unwritten; /* Number of inflight conversions pending */
+	struct mutex i_aio_mutex; /* big hammer for unaligned AIO */
 
 	/*
 	 * Transactions that contain inode's metadata needed to complete
@@ -1850,6 +1852,11 @@ static inline void set_bitmap_uptodate(s
 
 #define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)
 
+#define WQ_HASH_SZ		37
+extern wait_queue_head_t aio_wq[];
+#define to_aio_wq(v)	(&aio_wq[((unsigned long)(v)) % WQ_HASH_SZ])
+extern void ext4_aiodio_wait(struct inode *inode);
+
 #endif	/* __KERNEL__ */
 
 #endif	/* _EXT4_H */
diff -up linux-2.6.18-238.12.1/fs/ext4/extents.c linux-2.6.18-274.3.1/fs/ext4/extents.c
--- linux-2.6.18-238.12.1/fs/ext4/extents.c	2011-10-21 01:01:24.501691546 +0800
+++ linux-2.6.18-274.3.1/fs/ext4/extents.c	2011-10-21 01:40:05.236945128 +0800
@@ -3188,9 +3188,10 @@ ext4_ext_handle_uninitialized_extents(ha
 		 * that this IO needs to convertion to written when IO is
 		 * completed
 		 */
-		if (io)
+		if (io && (io->flag != DIO_AIO_UNWRITTEN)) {
 			io->flag = DIO_AIO_UNWRITTEN;
-		else
+			atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
+		} else
 			ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
 		goto out;
 	}
@@ -3474,9 +3475,10 @@ int ext4_ext_get_blocks(handle_t *handle
 		 * that we need to perform convertion when IO is done.
 		 */
 		if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
-			if (io)
+			if (io && (io->flag != DIO_AIO_UNWRITTEN)) {
 				io->flag = DIO_AIO_UNWRITTEN;
-			else
+				atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
+			} else
 				ext4_set_inode_state(inode,
 						     EXT4_STATE_DIO_UNWRITTEN);
 		}
diff -up linux-2.6.18-238.12.1/fs/ext4/file.c linux-2.6.18-274.3.1/fs/ext4/file.c
--- linux-2.6.18-238.12.1/fs/ext4/file.c	2011-06-09 18:00:13.033084783 +0800
+++ linux-2.6.18-274.3.1/fs/ext4/file.c	2011-10-21 01:40:05.237945132 +0800
@@ -53,12 +53,45 @@ static int ext4_release_file(struct inod
 	return 0;
 }
 
+void ext4_aiodio_wait(struct inode *inode)
+{
+	wait_queue_head_t *wq = to_aio_wq(inode);
+
+	wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_aiodio_unwritten) == 0));
+}
+
+/*
+ * This tests whether the IO in question is block-aligned or not.
+ * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
+ * are converted to written only after the IO is complete. Until they are
+ * mapped, these blocks appear as holes, so dio_zero_block() will assume that
+ * it needs to zero out portions of the start and/or end block. If 2 AIO
+ * threads are at work on the same unwritten block, they must be synchronized
+ * or one thread will zero the other's data, causing corruption.
+ */
+static int
+ext4_unaligned_aio(struct inode *inode, size_t count, loff_t pos)
+{
+	struct super_block *sb = inode->i_sb;
+	int blockmask = sb->s_blocksize - 1;
+	loff_t final_size = pos + count;
+
+	if (pos >= inode->i_size)
+		return 0;
+
+	if ((pos & blockmask) || (final_size & blockmask))
+		return 1;
+
+	return 0;
+}
+
 static ssize_t
 ext4_file_write(struct kiocb *iocb, const char __user *buf,
 		size_t count, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_dentry->d_inode;
+	int unaligned_aio = 0;
 	ssize_t ret;
 	int err;
 
@@ -75,9 +108,30 @@ ext4_file_write(struct kiocb *iocb, cons
 
 		if (pos + count > sbi->s_bitmap_maxbytes)
 			count = sbi->s_bitmap_maxbytes - pos;
-	}
+	} else if (unlikely((iocb->ki_filp->f_flags & O_DIRECT) &&
+		   !is_sync_kiocb(iocb)))
+		unaligned_aio = ext4_unaligned_aio(inode, count, pos);
+
+	/* Unaligned direct AIO must be serialized; see comment above */
+	if (unaligned_aio) {
+		static unsigned long unaligned_warn_time;
+
+		/* Warn about this once per day */
+		if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ))
+			ext4_msg(inode->i_sb, KERN_WARNING,
+				 "Unaligned AIO/DIO on inode %ld by %s; "
+				 "performance will be poor.",
+				 inode->i_ino, current->comm);
+
+		mutex_lock(&EXT4_I(inode)->i_aio_mutex);
+		ext4_aiodio_wait(inode);
+	}
 
 	ret = generic_file_aio_write(iocb, buf, count, pos);
+
+	if (unaligned_aio)
+		mutex_unlock(&EXT4_I(inode)->i_aio_mutex);
+
 	/*
 	 * Skip flushing if there was an error, or if nothing was written.
 	 */
diff -up linux-2.6.18-238.12.1/fs/ext4/ialloc.c linux-2.6.18-274.3.1/fs/ext4/ialloc.c
--- linux-2.6.18-238.12.1/fs/ext4/ialloc.c	2011-10-21 01:01:24.807691292 +0800
+++ linux-2.6.18-274.3.1/fs/ext4/ialloc.c	2011-10-21 01:40:05.239945140 +0800
@@ -242,8 +242,9 @@ void ext4_free_inode(handle_t *handle, s
 		goto error_return;
 
 	/* Ok, now we can actually update the inode bitmaps.. */
-	cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
-					bit, bitmap_bh->b_data);
+	ext4_lock_group(sb, block_group);
+	cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
+	ext4_unlock_group(sb, block_group);
 	if (!cleared)
 		ext4_error(sb, "bit already cleared for inode %lu", ino);
 	else {
diff -up linux-2.6.18-238.12.1/fs/ext4/inode.c linux-2.6.18-274.3.1/fs/ext4/inode.c
--- linux-2.6.18-238.12.1/fs/ext4/inode.c	2011-10-21 01:01:24.840691266 +0800
+++ linux-2.6.18-274.3.1/fs/ext4/inode.c	2011-10-21 01:40:05.245945169 +0800
@@ -3615,6 +3615,7 @@ static int ext4_end_aio_dio_nolock(ext4_
 	struct inode *inode = io->inode;
 	loff_t offset = io->offset;
 	ssize_t size = io->size;
+	wait_queue_head_t *wq;
 	int ret = 0;
 
 	ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p,"
@@ -3641,7 +3642,15 @@ static int ext4_end_aio_dio_nolock(ext4_
 	if (io->iocb)
 		aio_complete(io->iocb, io->result, 0);
 	/* clear the DIO AIO unwritten flag */
-	io->flag = 0;
+	if (io->flag == DIO_AIO_UNWRITTEN) {
+		io->flag = 0;
+		/* Wake up anyone waiting on unwritten extent conversion */
+		wq = to_aio_wq(inode);
+		if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten) &&
+		    waitqueue_active(wq))
+			wake_up_all(wq);
+	}
+
 	return ret;
 }
 /*
diff -up linux-2.6.18-238.12.1/fs/ext4/super.c linux-2.6.18-274.3.1/fs/ext4/super.c
--- linux-2.6.18-238.12.1/fs/ext4/super.c	2011-10-21 01:01:24.894691219 +0800
+++ linux-2.6.18-274.3.1/fs/ext4/super.c	2011-10-21 01:40:05.260945236 +0800
@@ -185,6 +185,7 @@ void ext4_itable_unused_set(struct super
 		bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
 }
 
+wait_queue_head_t aio_wq[WQ_HASH_SZ];
 
 /* Just increment the non-pointer handle value */
 static handle_t *ext4_get_nojournal(void)
@@ -753,6 +754,7 @@ static struct inode *ext4_alloc_inode(st
 	ei->cur_aio_dio = NULL;
 	ei->i_sync_tid = 0;
 	ei->i_datasync_tid = 0;
+	atomic_set(&ei->i_aiodio_unwritten, 0);
 
 	return &ei->vfs_inode;
 }
@@ -780,6 +782,7 @@ static void init_once(void * foo, struct
 	init_rwsem(&ei->xattr_sem);
 #endif
 	init_rwsem(&ei->i_data_sem);
+	mutex_init(&ei->i_aio_mutex);
 	inode_init_once(&ei->vfs_inode);
 }
 
@@ -1032,6 +1035,8 @@ static int bdev_try_to_free_page(struct
 #define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group")
 #define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
 
+static int ext4_dquot_initialize(struct inode *inode, int type);
+static int ext4_dquot_drop(struct inode *inode);
 static int ext4_write_dquot(struct dquot *dquot);
 static int ext4_acquire_dquot(struct dquot *dquot);
 static int ext4_release_dquot(struct dquot *dquot);
@@ -1046,8 +1051,8 @@ static ssize_t ext4_quota_write(struct s
 				const char *data, size_t len, loff_t off);
 
 static struct dquot_operations ext4_quota_operations = {
-	.initialize	= dquot_initialize,
-	.drop		= dquot_drop,
+	.initialize	= ext4_dquot_initialize,
+	.drop		= ext4_dquot_drop,
 	.alloc_space	= dquot_alloc_space,
 	.reserve_space	= dquot_reserve_space,
 	.claim_space	= dquot_claim_space,
@@ -3773,6 +3778,44 @@ static inline struct inode *dquot_to_ino
 	return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type];
 }
 
+static int ext4_dquot_initialize(struct inode *inode, int type)
+{
+	handle_t *handle;
+	int ret, err;
+
+	/* We may create quota structure so we need to reserve enough blocks */
+	handle = ext4_journal_start(inode, 2*EXT4_QUOTA_INIT_BLOCKS(inode->i_sb));
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+	ret = dquot_initialize(inode, type);
+	err = ext4_journal_stop(handle);
+	if (!ret)
+		ret = err;
+	return ret;
+}
+
+static int ext4_dquot_drop(struct inode *inode)
+{
+	handle_t *handle;
+	int ret, err;
+
+	/* We may delete quota structure so we need to reserve enough blocks */
+	handle = ext4_journal_start(inode, 2*EXT4_QUOTA_DEL_BLOCKS(inode->i_sb));
+	if (IS_ERR(handle)) {
+		/*
+		 * We call dquot_drop() anyway to at least release references
+		 * to quota structures so that umount does not hang.
+		 */
+		dquot_drop(inode);
+		return PTR_ERR(handle);
+	}
+	ret = dquot_drop(inode);
+	err = ext4_journal_stop(handle);
+	if (!ret)
+		ret = err;
+	return ret;
+}
+
 static int ext4_write_dquot(struct dquot *dquot)
 {
 	int ret, err;
@@ -4062,12 +4105,16 @@ MODULE_ALIAS("ext4dev");
 static int __init init_ext4_fs(void)
 {
 	int err;
+	int i;
 
 	ext4_zero_page = alloc_page(GFP_USER);
 	if (!ext4_zero_page)
 		return -ENOMEM;
 	zero_user(ext4_zero_page, 0, PAGE_CACHE_SIZE);
 
+	for (i = 0; i < WQ_HASH_SZ; i++)
+		init_waitqueue_head(&aio_wq[i]);
+
 	err = init_ext4_system_zone();
 	if (err)
 		goto out5;