commit c3091cc1047d691fb7803422728bfcb69d58f474 Author: Takamura Tatsushi Date: Thu Nov 1 17:07:01 2018 +0900 issue#1 DL-SNAP: Directory Level Snapshot DL-SNAP is a feature designed for directory-level file backups. It is implemented on top of Lustre ldiskfs without modifying the ext4 disk format, although a dedicated flag is set in the FEATURE field of the superblock in order to check whether DL-SNAP is enabled. Because of this flag, the e2fsck command fails even though the ext2 file system structure itself is not modified. DL-SNAP uses a COW (Copy On Write) mechanism to reduce backup time and storage usage. When a snapshot is created, only inodes are created. At this point, no new data blocks are allocated on the OST. Data blocks of the snapshot are allocated when the original file is modified. Not only root users but also ordinary users can create snapshots. Users can create a snapshot of a directory using the lfs command with the snapshot option, and restore files by the usual means such as the cp command. diff --git a/ldiskfs/Makefile.in b/ldiskfs/Makefile.in index bc3f058..a5d95f8 100644 --- a/ldiskfs/Makefile.in +++ b/ldiskfs/Makefile.in @@ -18,9 +18,11 @@ ext3_new_headers := ext3_extents.h ext4_new_sources := fiemap.h mmp.c ext4_new_sources += htree_lock.c ext4_new_headers := +snapshot_new_sources := snapshot.c snapshot.h snapshot_debug.h new_sources := $(ext4_new_sources) new_headers := $(ext4_new_headers) +new_sources += $(snapshot_new_sources) ldiskfs_patched_sources := $(notdir $(backfs_sources) $(backfs_headers)) $(new_sources) $(new_headers) ldiskfs_sources := $(ldiskfs_patched_sources) diff --git a/ldiskfs/kernel_patches/patches/rhel6.5/dl_snapshot.patch b/ldiskfs/kernel_patches/patches/rhel6.5/dl_snapshot.patch new file mode 100644 index 0000000..1b6d786 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel6.5/dl_snapshot.patch @@ -0,0 +1,3494 @@ +diff -urN -x .svn linux-stage.org/fs/ext4/ext4.h linux-stage/fs/ext4/ext4.h +--- linux-stage.org/fs/ext4/ext4.h 2018-08-31 
20:53:57.000000000 +0900 ++++ linux-stage/fs/ext4/ext4.h 2018-09-03 14:15:26.000000000 +0900 +@@ -326,6 +326,9 @@ + #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ + #define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ + #define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ ++#define EXT4_SNAPSHOT_SHARE_FL 0x01000000 /* snapshot data share */ ++#define EXT4_SNAPSHOT_SP_FL 0x04000000 /* snapshot flag */ ++#define EXT4_SNAPSHOT_FL 0x08000000 /* snapshot file/dir */ + #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ + + #define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */ +@@ -382,6 +385,9 @@ + EXT4_INODE_EXTENTS = 19, /* Inode uses extents */ + EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ + EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */ ++ EXT4_INODE_SNAPSHOT_SHARE = 24, /* snapshot data share */ ++ EXT4_INODE_SNAPSHOT_SP = 26, /* snapshot flag */ ++ EXT4_INODE_SNAPSHOT = 27, /* snapshot file/dir */ + EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ + }; + +@@ -1342,6 +1348,7 @@ + #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 + #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 + #define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 ++#define EXT4_FEATURE_RO_COMPAT_SNAPSHOT 0x40000000 + + #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 + #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 +@@ -1375,7 +1382,9 @@ + EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ + EXT4_FEATURE_RO_COMPAT_HUGE_FILE| \ +- EXT4_FEATURE_RO_COMPAT_QUOTA) ++ EXT4_FEATURE_RO_COMPAT_QUOTA| \ ++ EXT4_FEATURE_RO_COMPAT_SNAPSHOT) ++ + + /* + * Default values for user and/or group using reserved blocks +@@ -1772,6 +1781,8 @@ + struct super_block *sb; /* super block of the fs */ + }; + ++struct ext4_snapshot_gen_lock; ++ + /* + * Check interval multiplier + * The MMP block is written every update interval and initially checked every +@@ -1956,6 +1967,8 @@ + extern int 
flush_aio_dio_completed_IO(struct inode *inode); + extern void ext4_da_update_reserve_space(struct inode *inode, + int used, int quota_claim); ++extern int ext4_snapshot_orphan_truncate(struct inode *inode, ++ struct ext4_snapshot_gen_lock **lock); + /* ioctl.c */ + extern long ext4_ioctl(struct file *, unsigned int, unsigned long); + extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); +@@ -2317,6 +2330,16 @@ + /* mmp.c */ + extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); + ++/* snapshot.c */ ++extern int ext4_snapshot_get_enable(struct super_block *sb); ++extern int ext4_snapshot_set_enable(struct super_block *sb); ++extern int ext4_snapshot_clone(struct inode *snap, struct inode *orig); ++extern int ext4_snapshot_destroy(struct inode *inode, void *orig_fid); ++extern int ext4_snapshot_get_orphan(struct inode *inode, void *fid_buf, ++ int *array_num); ++extern int ext4_snapshot_get_old_list(struct inode *inode, void *buf); ++extern int ext4_snapshot_list_orphan(struct super_block *sb, void *buf); ++extern int ext4_snapshot_set_del_flag(struct inode *inode); + /* + * Add new method to test wether block and inode bitmaps are properly + * initialized. 
With uninit_bg reading the block from disk is not enough +diff -urN -x .svn linux-stage.org/fs/ext4/ext4_extents.h linux-stage/fs/ext4/ext4_extents.h +--- linux-stage.org/fs/ext4/ext4_extents.h 2018-08-31 20:53:57.000000000 +0900 ++++ linux-stage/fs/ext4/ext4_extents.h 2018-09-03 14:15:27.000000000 +0900 +@@ -135,6 +135,11 @@ + #define EXT_BREAK 1 + #define EXT_REPEAT 2 + ++#define WRITE_PREP_OP 0 ++#define WRITE_COMMIT_OP 1 ++#define READ_OP 2 ++#define WRITE_COPY_OP 3 ++ + /* + * structure for external API + */ +diff -urN -x .svn linux-stage.org/fs/ext4/extents.c linux-stage/fs/ext4/extents.c +--- linux-stage.org/fs/ext4/extents.c 2018-08-31 20:53:57.000000000 +0900 ++++ linux-stage/fs/ext4/extents.c 2018-10-24 14:01:03.000000000 +0900 +@@ -42,7 +42,8 @@ + #include + #include + #include "ext4_jbd2.h" +-#include "ext4_extents.h" ++#include "mballoc.h" ++#include "snapshot.h" + + /* + * used by extent splitting. +@@ -337,6 +338,12 @@ + + if (len == 0) + return 0; ++ ++ /* sparse extent for snapshot file */ ++ if (EXT4_TEST_OST_SNAPSHOT_FILE(inode) && ++ EXT4_SNAPSHOT_TEST_SPARSE_EXTENT(ext)) ++ return 1; ++ + return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); + } + +@@ -1296,7 +1303,10 @@ + } + + *logical = le32_to_cpu(ex->ee_block) + ee_len - 1; +- *phys = ext4_ext_pblock(ex) + ee_len - 1; ++ if (!(EXT4_SNAPSHOT_TEST_SPARSE_EXTENT(ex) && ++ EXT4_TEST_OST_SNAPSHOT_FILE(inode))) ++ *phys = ext4_ext_pblock(ex) + ee_len - 1; ++ + return 0; + } + +@@ -1585,6 +1595,10 @@ + return 0; + #endif + ++ if (EXT4_TEST_OST_SNAPSHOT_FILE(inode) && ++ EXT4_SNAPSHOT_TEST_SPARSE_EXTENT(ex1) && ++ EXT4_SNAPSHOT_TEST_SPARSE_EXTENT(ex2)) ++ return 1; + if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2)) + return 1; + return 0; +@@ -1889,6 +1903,10 @@ + return err; + } + ++static int ext4_snapshot_read(struct inode *inode, ++ struct ext4_ext_cache *cbex, ++ int *exists, int *flags); ++ + static int ext4_fill_fiemap_extents(struct inode *inode, + ext4_lblk_t 
block, ext4_lblk_t num, + struct fiemap_extent_info *fieinfo) +@@ -1972,6 +1990,16 @@ + cbex.ec_block = start; + cbex.ec_len = end - start; + cbex.ec_start = 0; ++ if (EXT4_TEST_OST_SNAPSHOT_FILE(inode)) { ++ up_read(&EXT4_I(inode)->i_data_sem); ++ /* search for extents from new snapshots */ ++ err = ext4_snapshot_read(inode, &cbex, ++ &exists, &flags); ++ if (err < 0) ++ break; ++ down_read(&EXT4_I(inode)->i_data_sem); ++ } ++ + } else { + cbex.ec_block = le32_to_cpu(ex->ee_block); + cbex.ec_len = ext4_ext_get_actual_len(ex); +@@ -2331,6 +2359,17 @@ + unsigned short ee_len = ext4_ext_get_actual_len(ex); + int i, metadata = 0, flags =0; + ++ if (EXT4_TEST_OST_SNAPSHOT_FILE(inode) && ++ EXT4_SNAPSHOT_TEST_SPARSE_EXTENT(ex)) { ++ CDEBUG(D_INODE, "sparse extent [%u ->%u] " ++ "held by snapshot(=%lu)," ++ " it has no block\n", ++ le32_to_cpu(ex->ee_block), ++ le32_to_cpu(ex->ee_block) + ee_len - 1, ++ inode->i_ino); ++ return 0; ++ } ++ + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + metadata = 1; + flags = EXT4_FREE_BLOCKS_METADATA; +@@ -4487,6 +4526,1212 @@ + return (error < 0 ? 
error : 0); + } + ++/* ++ * ext4_snapshot_wait_writeback ++ * ++ * wait for writeback the un-written cache data ++ * ++ * \param[in] inode inode ++ * \param[in] offset file offset for read ++ * \param[in] len data size for read ++ * ++ * \retval none(void) ++ */ ++static void ext4_snapshot_wait_writeback(struct inode *inode, ++ loff_t offset, ssize_t len) ++{ ++ struct page *page; ++ ++ while (len > 0) { ++ int poff = offset & (PAGE_CACHE_SIZE - 1); ++ int plen = PAGE_CACHE_SIZE - poff; ++ ++ page = find_lock_page(inode->i_mapping, offset); ++ if (page) { ++ wait_on_page_writeback(page); ++ /* unlock and release cache */ ++ unlock_page(page); ++ page_cache_release(page); ++ } ++ /* next offset */ ++ offset += plen; ++ len -= plen; ++ } ++ return; ++} ++ ++static int ext4_snapshot_read(struct inode *inode, ++ struct ext4_ext_cache *cbex, ++ int *exists, int *flags) ++{ ++ struct ext4_ext_path *path = NULL, *p = NULL; ++ struct inode *new_inode, *cur; ++ struct ext4_snapshot_link link, next_link; ++ struct ext4_snapshot_link *cur_link_buf = NULL, ++ *next_link_buf = NULL, *backup = NULL; ++ struct ext4_extent *ex; ++ ext4_lblk_t next, end; ++ int depth = 0, err; ++ ++ err = ext4_snapshot_get_link(inode, &link); ++ if (err) { ++ SNAPSHOT_CONSOLE_ERR(err); ++ return err; ++ } ++ /* check existence of more new snapshot */ ++ if (!link.new_ino) ++ return 0; ++ ++ cur_link_buf = &link; ++ next_link_buf = &next_link; ++ ++ /* get snapshot new link data & new inode from xattr */ ++ new_inode = ext4_snapshot_read_link(inode, cur_link_buf, ++ next_link_buf, true, &err); ++ if (err) { ++ SNAPSHOT_CONSOLE_ERR(err); ++ return err; ++ } ++ cur = new_inode; ++ while (cur != NULL) { ++ err = 0; ++ ++ /* extent read lock */ ++ down_read(&EXT4_I(cur)->i_data_sem); ++ ++ if (path && ext_depth(cur) != depth) { ++ /* depth was changed. 
we have to realloc path */ ++ kfree(path); ++ path = NULL; ++ } ++ ++ /* look up the extent path for the requested block */ ++ p = ext4_ext_find_extent(cur, cbex->ec_block, ++ path); ++ if (IS_ERR(p)) { ++ err = PTR_ERR(p); ++ CERROR("cannot find extent on inode=%lu.\n", ++ cur->i_ino); ++ /* extent read unlock; cur is released after the loop */ ++ up_read(&EXT4_I(cur)->i_data_sem); ++ break; ++ } ++ ++ path = p; ++ depth = ext_depth(cur); ++ if (unlikely(path[depth].p_hdr == NULL)) { ++ up_read(&EXT4_I(cur)->i_data_sem); ++ EXT4_ERROR_INODE(cur, "path[%d].p_hdr == NULL", ++ depth); ++ err = -EIO; ++ break; ++ } ++ /* leaf-level extent found for the block, if any */ ++ ex = NULL; ++ ex = path[depth].p_ext; ++ ++ next = ext4_ext_next_allocated_block(path); ++ /* extent read unlock */ ++ up_read(&EXT4_I(cur)->i_data_sem); ++ ++ if (ex) { ++ ext4_lblk_t ee_block = ++ le32_to_cpu(ex->ee_block); ++ ext4_lblk_t ee_end = ++ ee_block + ext4_ext_get_actual_len(ex); ++ ++ if (cbex->ec_block < ee_block) { ++ /* extent starts above block: clip range at its start */ ++ end = ee_block; ++ if (end < cbex->ec_block + cbex->ec_len) ++ cbex->ec_len = end - cbex->ec_block; ++ } else if (ee_end <= cbex->ec_block) { ++ /* extent ends at or below block: clip range at next */ ++ end = cbex->ec_block + cbex->ec_len; ++ if (end >= next) ++ cbex->ec_len = next - cbex->ec_block; ++ } else if (ee_block <= cbex->ec_block) { ++ /* block lies inside the extent */ ++ loff_t offset, flen; ++ ++ cbex->ec_start = ext4_ext_pblock(ex) + ++ (cbex->ec_block - ee_block); ++ ++ end = ee_end; ++ if (cbex->ec_block + cbex->ec_len > end) ++ cbex->ec_len = end - cbex->ec_block; ++ /* wait for writeback of the cached pages in the range */ ++ offset = cbex->ec_block * ++ EXT4_BLOCK_SIZE(inode->i_sb); ++ flen = cbex->ec_len * ++ EXT4_BLOCK_SIZE(inode->i_sb); ++ ext4_snapshot_wait_writeback(cur, ++ offset, flen); ++ *exists = 1; ++ if (flags && ++ EXT4_SNAPSHOT_TEST_SPARSE_EXTENT(ex)) ++ *flags |= FIEMAP_EXTENT_UNWRITTEN; ++ break; ++ } ++ } ++ ext4_ext_drop_refs(path); ++ ++ backup = cur_link_buf; ++ cur_link_buf = next_link_buf; ++ next_link_buf = 
backup; ++ ++ /* get snapshot link data from xattr */ ++ ++ new_inode = ext4_snapshot_read_link(cur, ++ cur_link_buf, ++ next_link_buf, ++ true, &err); ++ ++ if (err) { ++ SNAPSHOT_CONSOLE_ERR(err); ++ break; ++ } ++ /* release new_inode for next inode */ ++ iput(cur); ++ cur = new_inode; ++ } ++ if (path) { ++ ext4_ext_drop_refs(path); ++ kfree(path); ++ } ++ ++ /* release new_inode for exit */ ++ if (cur) ++ iput(cur); ++ return err; ++} ++ ++/* ++ * read block from source, and write block to destination ++ * ++ * \param[in] handle journal handle ++ * \param[in] sb super block ++ * \param[in] src source block ++ * \param[in] dest destination block ++ * \param[in] count number of write blocks ++ * ++ * \retval 0 success ++ * \retval less than 0 failure (-errno) ++ */ ++ ++struct snapshot_bio_wait { ++ wait_queue_head_t sb_wait; ++ atomic_t sb_count; ++ int sb_err; ++}; ++ ++/* reference mpage_end_io_read() */ ++static void snapshot_end_io_read(struct bio *bio, int err) ++{ ++ struct snapshot_bio_wait *snapshot_bio_wait = bio->bi_private; ++ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); ++ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; ++ ++ do { ++ struct page *page = bvec->bv_page; ++ ++ if (--bvec >= bio->bi_io_vec) ++ prefetchw(&bvec->bv_page->flags); ++ ++ if (uptodate) { ++ ClearPageError(page); ++ SetPageUptodate(page); ++ } else { ++ CERROR("page=%p index=%lu devno=%u,%u fail to read\n", ++ page, page->index, MAJOR(bio->bi_bdev->bd_dev), ++ MINOR(bio->bi_bdev->bd_dev)); ++ SNAPSHOT_CONSOLE_ERR(err ? err : -EIO); ++ ClearPageUptodate(page); ++ SetPageError(page); ++ snapshot_bio_wait->sb_err = (err ? 
err : -EIO); ++ } ++ } while (bvec >= bio->bi_io_vec); ++ ++ if (atomic_dec_and_test(&snapshot_bio_wait->sb_count)) ++ wake_up(&snapshot_bio_wait->sb_wait); ++ ++ bio_put(bio); ++} ++ ++/* reference mpage_end_io_write() */ ++static void snapshot_end_io_write(struct bio *bio, int err) ++{ ++ struct snapshot_bio_wait *snapshot_bio_wait = bio->bi_private; ++ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); ++ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; ++ ++ do { ++ struct page *page = bvec->bv_page; ++ if (--bvec >= bio->bi_io_vec) ++ prefetchw(&bvec->bv_page->flags); ++ ++ if (!uptodate) { ++ SNAPSHOT_CONSOLE_ERR(err ? err : -EIO); ++ CERROR("page=%p index=%lu devno=%u,%u fail to write\n", ++ page, page->index, MAJOR(bio->bi_bdev->bd_dev), ++ MINOR(bio->bi_bdev->bd_dev)); ++ snapshot_bio_wait->sb_err = (err ? err : -EIO); ++ } ++ ++ unlock_page(page); ++ page_cache_release(page); ++ } while (bvec >= bio->bi_io_vec); ++ ++ if (atomic_dec_and_test(&snapshot_bio_wait->sb_count)) ++ wake_up(&snapshot_bio_wait->sb_wait); ++ ++ bio_put(bio); ++} ++ ++static int can_be_merged(struct bio *bio, sector_t sector) ++{ ++ unsigned int size = bio->bi_size >> 9; ++ return bio->bi_sector + size == sector ? 
1 : 0; ++} ++static int ext4_snapshot_do_bio(struct page *page, ++ struct inode *inode, ++ int nblocks, ++ ext4_fsblk_t pblock, ++ int rw, ++ struct bio **bio_p, ++ struct snapshot_bio_wait *snapshot_bio_wait, ++ int alloc_iovecs) ++{ ++ unsigned int blocksize = inode->i_sb->s_blocksize; ++ sector_t sector; ++ int sector_bits = inode->i_sb->s_blocksize_bits - 9; ++ struct bio *bio = *bio_p; ++ struct bio *tmp_bio = NULL; ++ ++ sector = (sector_t)pblock << sector_bits; ++ if (bio != NULL && ++ can_be_merged(bio, sector) && ++ bio_add_page(bio, page, ++ blocksize * nblocks, 0) != 0) { ++ return 0; ++ } ++ if (bio != NULL) { ++ atomic_inc(&snapshot_bio_wait->sb_count); ++ submit_bio(rw, bio); ++ *bio_p = NULL; ++ } ++ /* allocate new bio */ ++ tmp_bio = bio_alloc(GFP_NOIO, alloc_iovecs); ++ if (tmp_bio == NULL) { ++ CERROR("Can't allocate bio\n"); ++ return -ENOMEM; ++ } ++ tmp_bio->bi_bdev = inode->i_sb->s_bdev; ++ tmp_bio->bi_sector = sector; ++ tmp_bio->bi_rw = rw; ++ if (rw == READ) ++ tmp_bio->bi_end_io = snapshot_end_io_read; ++ else ++ tmp_bio->bi_end_io = snapshot_end_io_write; ++ ++ tmp_bio->bi_private = snapshot_bio_wait; ++ LASSERT(bio_add_page(tmp_bio, page, ++ blocksize * nblocks, 0) != 0); ++ *bio_p = tmp_bio; ++ return 0; ++} ++ ++static void init_snapshot_bio_wait(struct snapshot_bio_wait *snapshot_bio_wait) ++{ ++ init_waitqueue_head(&snapshot_bio_wait->sb_wait); ++ atomic_set(&snapshot_bio_wait->sb_count, 0); ++ snapshot_bio_wait->sb_err = 0; ++} ++ ++static int ext4_snapshot_submit_last_bio(struct bio *bio, ++ struct snapshot_bio_wait *snapshot_bio_wait) ++{ ++ atomic_inc(&snapshot_bio_wait->sb_count); ++ submit_bio(bio->bi_rw, bio); ++ wait_event(snapshot_bio_wait->sb_wait, ++ atomic_read(&snapshot_bio_wait->sb_count) == 0); ++ return snapshot_bio_wait->sb_err; ++} ++ ++static int ext4_snapshot_copy_page(struct inode *src_inode, ++ struct inode *dest_inode, ++ ext4_lblk_t start_block, ++ ext4_lblk_t end_block, ++ ext4_fsblk_t src, ++ ext4_fsblk_t 
dest) ++{ ++ struct super_block *sb = dest_inode->i_sb; ++ int i, j = 0, n = 0; ++ unsigned int blocksize = sb->s_blocksize; ++ int blocks_per_page = PAGE_CACHE_SIZE / blocksize; ++ struct snapshot_bio_wait snapshot_bio_wait; ++ struct page *src_page = NULL; ++ bool lock = false; ++ int nblocks; ++ struct bio *bio = NULL; ++ int rc = 0; ++ struct page **pages; ++ ++ pages = kmalloc(sizeof(struct page *) * (end_block - start_block), ++ GFP_NOFS); ++ if (!pages) { ++ CERROR("fail to alloc pages\n"); ++ SNAPSHOT_CONSOLE_ERR(-ENOMEM); ++ return -ENOMEM; ++ } ++ init_snapshot_bio_wait(&snapshot_bio_wait); ++ for (i = start_block; i < end_block;) { ++ int pnum = (i / blocks_per_page); ++ int poff = (i % blocks_per_page); ++ nblocks = blocks_per_page - poff; ++ ++ src_page = find_get_page(src_inode->i_mapping, ++ pnum); ++ if (src_page == NULL) { ++ retry: ++ src_page = find_or_create_page(src_inode->i_mapping, ++ pnum, ++ GFP_NOFS | __GFP_HIGHMEM); ++ if (unlikely(src_page == NULL)) { ++ CERROR("fail to get src_page " ++ "for src_inode=%lu\n", ++ src_inode->i_ino); ++ rc = -ENOMEM; ++ goto out_src_release; ++ } ++ lock = true; ++ } else if (!PageLocked(src_page)) { ++ CDEBUG(D_INODE, "inode=%lu page is not " ++ "locked src_page =%p, " ++ "flags=%ld src_page->index=%lu\n", ++ src_inode->i_ino, src_page, ++ src_page->flags, src_page->index); ++ page_cache_release(src_page); ++ goto retry; ++ } ++ wait_on_page_writeback(src_page); ++ ++ if (!PageUptodate(src_page)) { ++ CDEBUG(D_INODE, "inode=%lu NOT on cache src_page =%p, " ++ "flags=%ld src_page->index=%lu\n", ++ src_inode->i_ino, src_page, ++ src_page->flags, src_page->index); ++ ++ rc = ext4_snapshot_do_bio(src_page, ++ src_inode, nblocks, ++ src + (i - start_block), ++ READ, &bio, ++ &snapshot_bio_wait, ++ end_block - i); ++ if (rc < 0) { ++ SNAPSHOT_CONSOLE_ERR(rc); ++ goto out_bio_wait; ++ } ++ } ++ pages[j++] = src_page; ++ i += nblocks; ++ } ++ if (bio != NULL) { ++ rc = ext4_snapshot_submit_last_bio(bio, ++ 
&snapshot_bio_wait); ++ if (rc) { ++ CERROR("fail to read bio " ++ "rc =%d src_inode=%lu\n", ++ rc, src_inode->i_ino); ++ SNAPSHOT_CONSOLE_ERR(rc); ++ goto out_src_release; ++ } ++ } ++ init_snapshot_bio_wait(&snapshot_bio_wait); ++ bio = NULL; ++ ++ for (i = start_block; i < end_block;) { ++ void *src_addr = NULL, *dest_addr = NULL; ++ struct page *src_page = NULL, *dest_page = NULL; ++ int pnum = (i / blocks_per_page); ++ int poff = (i % blocks_per_page); ++ nblocks = blocks_per_page - poff; ++ ++ LASSERT(n < j); ++ src_page = pages[n++]; ++ ++ LASSERT(PageUptodate(src_page) && !PageError(src_page)); ++ src_addr = kmap(src_page) + (poff * sb->s_blocksize); ++ ++ /* get destination block */ ++ dest_page = find_or_create_page(dest_inode->i_mapping, pnum, ++ GFP_NOFS | __GFP_HIGHMEM); ++ if (unlikely(dest_page == NULL)) { ++ CERROR("fail to get dest_page for dest_inode=%lu\n", ++ dest_inode->i_ino); ++ ++ kunmap(src_page); ++ if (lock) ++ unlock_page(src_page); ++ page_cache_release(src_page); ++ rc = -ENOMEM; ++ goto out_bio_wait; ++ } ++ LASSERT(src_page->index == dest_page->index); ++ wait_on_page_writeback(dest_page); ++ dest_addr = kmap(dest_page) + (poff * blocksize); ++ ++ memcpy(dest_addr, src_addr, nblocks * blocksize); ++ ++ kunmap(dest_page); ++ kunmap(src_page); ++ ++ if (lock) ++ unlock_page(src_page); ++ page_cache_release(src_page); ++ ++ /* write */ ++ SetPageUptodate(dest_page); ++ rc = ext4_snapshot_do_bio(dest_page, dest_inode, ++ nblocks, dest + (i - start_block), ++ WRITE, &bio, &snapshot_bio_wait, ++ end_block - i); ++ if (rc) { ++ unlock_page(dest_page); ++ page_cache_release(dest_page); ++ SNAPSHOT_CONSOLE_ERR(rc); ++ goto out_bio_wait; ++ } ++ i += nblocks; ++ } ++ if (bio != NULL) { ++ rc = ext4_snapshot_submit_last_bio(bio, ++ &snapshot_bio_wait); ++ if (rc) { ++ CERROR("fail to write bio " ++ "rc =%d src_inode=%lu\n", ++ rc, src_inode->i_ino); ++ SNAPSHOT_CONSOLE_ERR(rc); ++ } ++ } ++ kfree(pages); ++ return rc; ++ ++out_bio_wait: ++ 
wait_event(snapshot_bio_wait.sb_wait, ++ atomic_read(&snapshot_bio_wait.sb_count) == 0); ++ if (bio) ++ bio_put(bio); ++out_src_release: ++ while (n < j) { ++ src_page = pages[n++]; ++ if (lock) ++ unlock_page(src_page); ++ page_cache_release(src_page); ++ ++ } ++ kfree(pages); ++ return rc; ++ ++} ++ ++static int ext4_snapshot_copy_data(struct inode *src_inode, ++ struct inode *dest_inode, ++ ext4_lblk_t start_block, ++ ext4_fsblk_t src, ++ ext4_fsblk_t dest, ++ unsigned long total_count) ++{ ++ int blocks_per_page = PAGE_CACHE_SIZE / src_inode->i_sb->s_blocksize; ++ int count; ++ int rc = 0; ++ ++ while (total_count > 0) { ++ if (total_count <= BIO_MAX_PAGES * blocks_per_page) ++ return ext4_snapshot_copy_page(src_inode, ++ dest_inode, ++ start_block, ++ start_block + total_count, ++ src, dest); ++ else { ++ count = BIO_MAX_PAGES * blocks_per_page; ++ rc = ext4_snapshot_copy_page(src_inode, ++ dest_inode, ++ start_block, ++ start_block + count, ++ src, dest); ++ if (rc) ++ return rc; ++ total_count -= count; ++ start_block += count; ++ src += count; ++ dest += count; ++ } ++ } ++ return 0; ++} ++ ++static int ext4_snapshot_do_copy_extent(handle_t *handle, ++ struct inode *src_inode, ++ struct inode *inode, ++ struct ext4_ext_path *path, ++ ext4_lblk_t *start_block, ++ ext4_lblk_t end_block, ++ ext4_fsblk_t *start_pblock) ++{ ++ struct ext4_extent newext; ++ struct ext4_allocation_request ar; ++ int err = 0; ++ ext4_fsblk_t newblock; ++ ext4_lblk_t copy_blocks = (end_block - *start_block); ++ ++ /* reference ext4_ext_get_blocks() */ ++ ar.lleft = *start_block; ++ err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft); ++ if (err) ++ return err; ++ ++ ar.lright = *start_block; ++ err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright); ++ if (err) ++ return err; ++ ++ ar.inode = inode; ++ ar.goal = ext4_ext_find_goal(inode, path, *start_block); ++ ar.logical = *start_block; ++ ar.len = copy_blocks; ++ ar.flags = EXT4_MB_HINT_DATA; ++ ++ newblock = 
ext4_mb_new_blocks(handle, &ar, &err); ++ if (!newblock) { ++ CERROR("fail to alloc blocks " ++ "goal=%llu len=%u inode=%lu err=%d\n", ++ ar.goal, ar.len, inode->i_ino, err); ++ return err; ++ } ++ newext.ee_block = cpu_to_le32(*start_block); ++ ext4_ext_store_pblock(&newext, newblock); ++ newext.ee_len = cpu_to_le16(ar.len); ++ ++ err = ext4_snapshot_copy_data(src_inode, inode, ++ *start_block, ++ *start_pblock, newblock, ++ ar.len); ++ if (err) ++ goto out_free; ++ ++ err = ext4_ext_insert_extent(handle, inode, path, &newext, 0); ++ if (err) { ++ CERROR("fail to insert extent inode=%lu err=%d\n", ++ inode->i_ino, err); ++ goto out_free; ++ } ++ ++ *start_block += ar.len; ++ *start_pblock += ar.len; ++ ++ out_free: ++ if (err) { ++ ext4_discard_preallocations(inode); ++ ext4_free_blocks(handle, inode, newblock, ar.len, 0); ++ } ++ ++ return err; ++} ++ ++static int ext4_snapshot_copy_extent(handle_t *handle, ++ struct inode *src_inode, ++ struct inode *inode, ++ struct ext4_ext_path *path, ++ ext4_lblk_t start_block, ++ ext4_lblk_t end_block, ++ ext4_fsblk_t start_pblock) ++{ ++ int err = 0; ++ unsigned int credits; ++ ++ while (start_block < end_block) { ++ credits = ext4_chunk_trans_blocks(inode, end_block - start_block); ++ if (unlikely(!ext4_handle_has_enough_credits(handle, credits))) { ++ err = ext4_journal_extend(handle, credits); ++ if (err < 0) { ++ CERROR("couldn't extend journal inode=%lu " ++ "handle=%p need=%d has=%d err=%d\n", ++ inode->i_ino, handle, credits, ++ handle->h_buffer_credits, err); ++ return err; ++ } else if (err != 0) { ++ err = ext4_journal_restart(handle, credits); ++ if (err) { ++ /* ++ * This should never happen. ++ * It may panic with ext4_journal_stop() ++ * if ext4_journal_restart() fails. 
++ */ ++ CERROR("couldn't extend and restart " ++ "journal inode=%lu handle=%p " ++ "need=%d err=%d\n", ++ inode->i_ino, handle, credits, ++ err); ++ return err; ++ } ++ } ++ } ++ err = ext4_snapshot_do_copy_extent(handle, ++ src_inode, inode, ++ path, &start_block, ++ end_block, ++ &start_pblock); ++ if (err) ++ return err; ++ } ++ return err; ++} ++ ++static int ext4_snapshot_insert_sparse_extent(handle_t *handle, ++ struct inode *inode, ++ struct ext4_ext_path *path, ++ ext4_lblk_t start_block, ++ ext4_lblk_t end_block) ++{ ++ struct ext4_extent newext; ++ int err = 0; ++ unsigned int credits; ++ ext4_lblk_t copy_blocks = (end_block - start_block); ++ ++ credits = ext4_chunk_trans_blocks(inode, 0); ++ if (unlikely(!ext4_handle_has_enough_credits(handle, credits))) { ++ err = ext4_journal_extend(handle, credits); ++ if (err < 0) { ++ CERROR("couldn't extend journal " ++ "inode=%lu handle=%p need=%d has=%d err=%d\n", ++ inode->i_ino, handle, credits, ++ handle->h_buffer_credits, err); ++ return err; ++ } else if (err != 0) { ++ err = ext4_journal_restart(handle, credits); ++ if (err) { ++ /* This should never happen. ++ * It may panic with ext4_journal_stop() if ++ * ext4_journal_restart() fails. 
++ */ ++ CERROR("couldn't extend and restart journal " ++ "inode=%lu handle=%p need=%d err=%d\n", ++ inode->i_ino, handle, credits, ++ err); ++ ++ return err; ++ } ++ } ++ } ++ ++ /* reference ext4_ext_get_blocks() */ ++ newext.ee_block = cpu_to_le32(start_block); ++ newext.ee_len = cpu_to_le16(copy_blocks); ++ ++ EXT4_SNAPSHOT_SET_SPARSE_EXTENT(&newext); ++ ++ err = ext4_ext_insert_extent(handle, inode, path, &newext, 0); ++ if (err) ++ CERROR("fail to insert extent inode=%lu err=%d\n", ++ inode->i_ino, err); ++ return err; ++} ++ ++static int ext4_snapshot_do_copy(handle_t *handle, ++ struct inode *src_inode, ++ struct inode *inode, ++ ext4_lblk_t start_block, ++ ext4_lblk_t end_block, ++ ext4_fsblk_t start_pblock, ++ bool sparse, bool sync_journal) ++{ ++ ext4_lblk_t block = start_block; ++ ext4_fsblk_t pblock = start_pblock; ++ struct ext4_ext_path *path = NULL, *p = NULL; ++ int depth = 0, err = 0; ++ int block_copy = 0; ++ ++ down_write(&EXT4_I(inode)->i_data_sem); ++ while (block < end_block) { ++ struct ext4_extent *ex = NULL; ++ ext4_lblk_t copy_start, copy_end, next; ++ ext4_fsblk_t copy_pstart; ++ ++ if (path && ext_depth(inode) != depth) { ++ /* depth was changed. we have to realloc path */ ++ kfree(path); ++ path = NULL; ++ } ++ ++ p = ext4_ext_find_extent(inode, block, path); ++ if (IS_ERR(p)) { ++ err = PTR_ERR(p); ++ CERROR("cannot find extent on inode=%lu err=%d\n", ++ inode->i_ino, err); ++ ++ break; ++ } ++ ++ path = p; ++ depth = ext_depth(inode); ++ if (unlikely(path[depth].p_hdr == NULL)) { ++ EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", ++ depth); ++ err = -EIO; ++ break; ++ } ++ ex = path[depth].p_ext; ++ next = ext4_ext_next_allocated_block(path); ++ /* reference ext4_ext_walk_space() */ ++ if (!ex) { ++ /* there is no extent. 
++ * so try to copy all */ ++ copy_start = block; ++ copy_end = end_block; ++ copy_pstart = pblock; ++ } else { ++ ext4_lblk_t ee_block = ++ le32_to_cpu(ex->ee_block); ++ ext4_lblk_t ee_end = ++ ee_block + ext4_ext_get_actual_len(ex); ++ ++ if (block < ee_block) { ++ /* need to copy before found extent */ ++ copy_start = block; ++ copy_end = ee_block; ++ copy_pstart = pblock; ++ } else if (ee_end <= block) { ++ /* need to copy after found extent */ ++ copy_start = block; ++ copy_end = end_block; ++ copy_pstart = pblock; ++ } else if (ee_end < end_block) { ++ /* some part of requested space is covered ++ * by found extent */ ++ copy_start = ee_end; ++ copy_end = end_block; ++ copy_pstart = pblock + (copy_start - block); ++ } else ++ break; ++ ++ } ++ if (next <= copy_end) ++ copy_end = next; ++ if (end_block < copy_end) ++ copy_end = end_block; ++ if (copy_start >= copy_end) ++ goto skip_copy; ++ ++ if (sparse) { ++ err = ext4_snapshot_insert_sparse_extent(handle, ++ inode, path, ++ copy_start, ++ copy_end); ++ } else { ++ err = ext4_snapshot_copy_extent(handle, ++ src_inode, ++ inode, path, ++ copy_start, ++ copy_end, ++ copy_pstart); ++ block_copy++; ++ } ++ skip_copy: ++ ext4_ext_drop_refs(path); ++ if (err) ++ break; ++ ++ pblock += (copy_end - block); ++ block = copy_end; ++ } ++ up_write(&EXT4_I(inode)->i_data_sem); ++ ++ if (path) { ++ ext4_ext_drop_refs(path); ++ kfree(path); ++ } ++ if (err == 0 && sync_journal && block_copy) ++ ext4_handle_sync(handle); ++ ++ return err; ++} ++ ++/* ++ * ext4_snapshot_copy_sparse ++ * ++ * insert sparse extents to old snapshot file. 
++ * ++ * \param[in] inode inode of snapshot file ++ * \param[in] start start lblock ++ * \param[in] end end lblock ++ * ++ * \retval 0 success ++ * \retval less than 0 failure (-errno) ++ */ ++static int ext4_snapshot_copy_sparse(struct inode *inode, ++ ext4_lblk_t start, ++ ext4_lblk_t end) ++{ ++ handle_t *handle; ++ struct inode *old_inode; ++ struct super_block *old_sb; ++ struct ext4_snapshot_link link, next_link; ++ ext4_lblk_t max; ++ int credits, err; ++ bool clear_link = false; ++ ++ err = ext4_snapshot_get_link(inode, &link); ++ if (err) { ++ SNAPSHOT_CONSOLE_ERR(err); ++ credits = 1; /* iflag only */ ++ goto clear_snap; ++ } ++ ++ /* get snapshot old link data & old inode from xattr */ ++ old_inode = ext4_snapshot_read_link(inode, &link, ++ &next_link, false, &err); ++ if (err == 0 && old_inode == NULL) { ++ CERROR("invalid orig link link.old_ino == 0," ++ " inode=%lu\n", inode->i_ino); ++ err = -ENOENT; ++ } ++ if (err) { ++ SNAPSHOT_CONSOLE_ERR(err); ++ credits = ext4_calc_snapshot_link_credits( ++ inode, SNAPSHOT_CLEAR_LINK_OP); ++ clear_link = true; ++ goto clear_snap; ++ } ++ old_sb = old_inode->i_sb; ++ ++ /* ignore block bigger than file size */ ++ max = (old_inode->i_size + EXT4_BLOCK_SIZE(old_sb) - 1) ++ >> EXT4_BLOCK_SIZE_BITS(old_sb); ++ if (end > max) ++ end = max; ++ if (start >= end) ++ goto out; ++ ++ /* start journal */ ++ credits = ext4_ext_calc_credits_for_insert(old_inode, NULL) ++ + EXT4_ALLOC_NEEDED + 1; ++ handle = ext4_journal_start(old_inode, credits); ++ if (IS_ERR(handle)) { ++ err = PTR_ERR(handle); ++ SNAPSHOT_CONSOLE_ERR(err); ++ goto out; ++ } ++ ++ err = ext4_snapshot_do_copy(handle, inode, ++ old_inode, start, ++ end, 0, true, false); ++ ++ ext4_journal_stop(handle); ++out: ++ iput(old_inode); ++ return err; ++ ++/* When an error occurs, change the inode to no snapshot, ++ * and continue the process. 
*/ ++clear_snap: ++ handle = ext4_journal_start(inode, credits); ++ if (IS_ERR(handle)) { ++ err = PTR_ERR(handle); ++ SNAPSHOT_CONSOLE_ERR(err); ++ return err; ++ } ++ if (clear_link) ++ ext4_snapshot_del_link(handle, inode); ++ EXT4_CLEAR_OST_SNAPSHOT_FLAGS(inode); ++ ext4_mark_inode_dirty(handle, inode); ++ ext4_journal_stop(handle); ++ return 0; ++} ++ ++int ext4_snapshot_copy_blocks(handle_t *handle, ++ struct inode *inode, ++ struct inode *old_snap, ++ ext4_lblk_t start_block, ++ ext4_lblk_t end_block, ++ bool sync_journal) ++{ ++ int depth = 0, err = 0; ++ ext4_lblk_t block; ++ struct ext4_ext_path *path = NULL, *p = NULL; ++ ++ block = start_block; ++ ++ while (block < end_block && block != EXT_MAX_BLOCKS) { ++ struct ext4_extent *ex = NULL; ++ ext4_lblk_t ee_block, ee_end, next; ++ ext4_lblk_t copy_start, copy_end; ++ ext4_fsblk_t copy_pstart; ++ ++ down_read(&EXT4_I(inode)->i_data_sem); ++ ++ if (path && ext_depth(inode) != depth) { ++ /* depth was changed. we have to realloc path */ ++ kfree(path); ++ path = NULL; ++ } ++ ++ p = ext4_ext_find_extent(inode, block, path); ++ if (IS_ERR(p)) { ++ err = PTR_ERR(p); ++ CERROR("cannot find extent on inode=%lu.\n", ++ inode->i_ino); ++ up_read(&EXT4_I(inode)->i_data_sem); ++ break; ++ } ++ ++ path = p; ++ depth = ext_depth(inode); ++ if (unlikely(path[depth].p_hdr == NULL)) { ++ up_read(&EXT4_I(inode)->i_data_sem); ++ EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", ++ depth); ++ err = -EIO; ++ break; ++ } ++ ex = path[depth].p_ext; ++ next = ext4_ext_next_allocated_block(path); ++ if (!ex) { ++ up_read(&EXT4_I(inode)->i_data_sem); ++ /* there is no extent. 
++ * not necessary to do anything */ ++ break; ++ } ++ ++ ee_block = le32_to_cpu(ex->ee_block); ++ ee_end = ee_block + ext4_ext_get_actual_len(ex); ++ ++ if ((block < ee_block) && (ee_block < end_block)) { ++ copy_start = ee_block; ++ copy_pstart = ext4_ext_pblock(ex); ++ } else if ((ee_block <= block) && ++ (block < ee_end)) { ++ copy_start = block; ++ copy_pstart = ext4_ext_pblock(ex) ++ + (block - ee_block); ++ } else { ++ up_read(&EXT4_I(inode)->i_data_sem); ++ /* there is no extent in the target range. ++ * not necessary to do anything */ ++ break; ++ } ++ copy_end = ee_end; ++ if (end_block < copy_end) ++ copy_end = end_block; ++ ext4_ext_drop_refs(path); ++ up_read(&EXT4_I(inode)->i_data_sem); ++ ++ err = ext4_snapshot_do_copy(handle, inode, old_snap, ++ copy_start, copy_end, ++ copy_pstart, ++ EXT4_SNAPSHOT_TEST_SPARSE_EXTENT(ex), ++ sync_journal); ++ if (err) ++ break; ++ ++ block = next; ++ } ++ ++ if (path) { ++ ext4_ext_drop_refs(path); ++ kfree(path); ++ } ++ return err; ++} ++ ++static int ext4_snapshot_copy_writeblocks(struct inode *inode, ++ ext4_lblk_t start, ++ ext4_lblk_t end) ++{ ++ handle_t *handle; ++ struct ext4_snapshot_link link, next_link; ++ struct inode *old_inode; ++ struct super_block *old_sb; ++ ext4_lblk_t max; ++ int credits, err; ++ bool clear_link = false; ++ ++ err = ext4_snapshot_get_link(inode, &link); ++ if (err) { ++ SNAPSHOT_CONSOLE_ERR(err); ++ credits = 1; /* iflag only */ ++ goto clear_snap; ++ } ++ ++ /* get snapshot old link data & old inode from xattr */ ++ old_inode = ext4_snapshot_read_link(inode, &link, ++ &next_link, false, &err); ++ if (err == 0 && old_inode == NULL) { ++ CERROR("invalid orig link link.old_ino == 0," ++ " inode=%lu\n", inode->i_ino); ++ err = -ENOENT; ++ } ++ if (err) { ++ SNAPSHOT_CONSOLE_ERR(err); ++ credits = ext4_calc_snapshot_link_credits( ++ inode, SNAPSHOT_CLEAR_LINK_OP); ++ clear_link = true; ++ goto clear_snap; ++ } ++ old_sb = old_inode->i_sb; ++ ++ /* ignore block bigger than file size 
*/ ++ max = (old_inode->i_size + EXT4_BLOCK_SIZE(old_sb) - 1) ++ >> EXT4_BLOCK_SIZE_BITS(old_sb); ++ if (end > max) ++ end = max; ++ if (start >= end) ++ goto out; ++ ++ /* start journal */ ++ credits = ext4_ext_calc_credits_for_insert(old_inode, NULL) ++ + EXT4_ALLOC_NEEDED + 1; ++ handle = ext4_journal_start(old_inode, credits); ++ if (IS_ERR(handle)) { ++ err = PTR_ERR(handle); ++ SNAPSHOT_CONSOLE_ERR(err); ++ goto out; ++ } ++ ++ err = ext4_snapshot_copy_blocks(handle, inode, ++ old_inode, start, end, ++ true); ++ ext4_journal_stop(handle); ++out: ++ iput(old_inode); ++ return err; ++ ++/* When an error occurs, change the inode to no snapshot, ++ * and continue the process. */ ++clear_snap: ++ handle = ext4_journal_start(inode, credits); ++ if (IS_ERR(handle)) { ++ err = PTR_ERR(handle); ++ SNAPSHOT_CONSOLE_ERR(err); ++ return err; ++ } ++ if (clear_link) ++ ext4_snapshot_del_link(handle, inode); ++ EXT4_CLEAR_OST_SNAPSHOT_FLAGS(inode); ++ ext4_mark_inode_dirty(handle, inode); ++ ext4_journal_stop(handle); ++ return 0; ++} ++ ++int ext4_snapshot_truncate_blocks(handle_t *handle, ++ struct inode *inode, ++ struct inode *old_inode, ++ ext4_lblk_t start_block, ++ ext4_lblk_t end_block) ++{ ++ struct ext4_ext_path *path = NULL, *p = NULL; ++ struct ext4_extent *ex; ++ ext4_lblk_t next, start = 0, end = 0; ++ int exists, depth = 0, err = 0; ++ ext4_lblk_t block = start_block; ++ ++ while (block < end_block && block != EXT_MAX_BLOCKS) { ++ /* find extent for this block */ ++ down_read(&EXT4_I(inode)->i_data_sem); ++ ++ if (path && ext_depth(inode) != depth) { ++ /* depth was changed. 
we have to realloc path */ ++ kfree(path); ++ path = NULL; ++ } ++ ++ p = ext4_ext_find_extent(inode, block, path); ++ if (IS_ERR(p)) { ++ err = PTR_ERR(p); ++ CERROR("cannot find extent on inode=%lu.\n", ++ inode->i_ino); ++ up_read(&EXT4_I(inode)->i_data_sem); ++ break; ++ } ++ ++ path = p; ++ depth = ext_depth(inode); ++ if (unlikely(path[depth].p_hdr == NULL)) { ++ up_read(&EXT4_I(inode)->i_data_sem); ++ EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", ++ depth); ++ err = -EIO; ++ break; ++ } ++ ex = path[depth].p_ext; ++ next = ext4_ext_next_allocated_block(path); ++ up_read(&EXT4_I(inode)->i_data_sem); ++ ++ exists = 0; ++ if (!ex) { ++ /* there is no extent yet, so try to allocate ++ * all requested space */ ++ start = block; ++ end = end_block; ++ } else if (le32_to_cpu(ex->ee_block) > block) { ++ /* need to allocate space before found extent */ ++ start = block; ++ end = le32_to_cpu(ex->ee_block); ++ if (end_block < end) ++ end = end_block; ++ } else if (block >= le32_to_cpu(ex->ee_block) ++ + ext4_ext_get_actual_len(ex)) { ++ /* need to allocate space after found extent */ ++ start = block; ++ end = end_block; ++ if (end >= next) ++ end = next; ++ } else if (block >= le32_to_cpu(ex->ee_block)) { ++ /* ++ * some part of requested space is covered ++ * by found extent ++ */ ++ start = block; ++ end = le32_to_cpu(ex->ee_block) ++ + ext4_ext_get_actual_len(ex); ++ if (end_block < end) ++ end = end_block; ++ exists = 1; ++ } else { ++ BUG(); ++ } ++ BUG_ON(end <= start); ++ ++ if (!exists) { ++ err = ext4_snapshot_do_copy(handle, inode, ++ old_inode, start, ++ end, 0, true, false); ++ } else { ++ err = ext4_snapshot_do_copy(handle, inode, ++ old_inode, start, ++ end, ++ ext4_ext_pblock(ex) ++ + (block - start), ++ false, false); ++ } ++ ext4_ext_drop_refs(path); ++ ++ if (err < 0) { ++ SNAPSHOT_CONSOLE_ERR(err); ++ break; ++ } ++ ++ block = end; ++ } ++ ++ if (path) { ++ ext4_ext_drop_refs(path); ++ kfree(path); ++ } ++ ++ return err; ++} ++ + int 
ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, + ext4_lblk_t num, ext_prepare_callback func, + void *cbdata) +@@ -4498,6 +5743,8 @@ + ext4_lblk_t last = block + num; + int depth, exists, err = 0; + ++ struct bpointers *bp = cbdata; ++ + BUG_ON(func == NULL); + BUG_ON(inode == NULL); + +@@ -4569,10 +5816,42 @@ + cbex.ec_block = start; + cbex.ec_len = end - start; + cbex.ec_start = 0; ++ if (EXT4_TEST_OST_SNAPSHOT_FILE(inode) ++ && (bp->create == READ_OP)) { ++ /* if READ process && inode is snapshot */ ++ err = ext4_snapshot_read(inode, &cbex, ++ &exists, NULL); ++ if (err < 0) ++ break; ++ } else if (EXT4_TEST_OST_SNAPSHOT_ORIG(inode) ++ && (bp->create == WRITE_COPY_OP)) { ++ /* snapshot copy sparse */ ++ err = ext4_snapshot_copy_sparse( ++ inode, start, end); ++ if (err < 0) ++ break; ++ } + } else { +- cbex.ec_block = le32_to_cpu(ex->ee_block); +- cbex.ec_len = ext4_ext_get_actual_len(ex); +- cbex.ec_start = ext4_ext_pblock(ex); ++ if (EXT4_TEST_OST_SNAPSHOT_ORIG(inode) ++ && (bp->create == WRITE_COPY_OP)) { ++ /* snapshot copy on write */ ++ err = ext4_snapshot_copy_writeblocks( ++ inode, start, end); ++ if (err) ++ break; ++ } ++ ++ if (EXT4_TEST_OST_SNAPSHOT_FILE(inode) && ++ bp->create == READ_OP && ++ EXT4_SNAPSHOT_TEST_SPARSE_EXTENT(ex)) { ++ cbex.ec_block = start; ++ cbex.ec_len = end - start; ++ cbex.ec_start = 0; ++ } else { ++ cbex.ec_block = le32_to_cpu(ex->ee_block); ++ cbex.ec_len = ext4_ext_get_actual_len(ex); ++ cbex.ec_start = ext4_ext_pblock(ex); ++ } + } + + if (unlikely(cbex.ec_len == 0)) { +diff -urN -x .svn linux-stage.org/fs/ext4/inode.c linux-stage/fs/ext4/inode.c +--- linux-stage.org/fs/ext4/inode.c 2018-08-31 20:53:57.000000000 +0900 ++++ linux-stage/fs/ext4/inode.c 2018-09-03 15:18:02.000000000 +0900 +@@ -40,9 +40,9 @@ + #include + + #include "ext4_jbd2.h" +-#include "xattr.h" + #include "acl.h" + #include "ext4_extents.h" ++#include "snapshot.h" + + #include + +@@ -5329,7 +5329,10 @@ + { + unsigned int flags = 
EXT4_I(inode)->i_flags; + +- inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); ++ inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC| ++ EXT4_SNAPSHOT_SHARE_FL| ++ EXT4_SNAPSHOT_SP_FL| ++ EXT4_SNAPSHOT_FL); + if (flags & EXT4_SYNC_FL) + inode->i_flags |= S_SYNC; + if (flags & EXT4_APPEND_FL) +@@ -5340,6 +5343,13 @@ + inode->i_flags |= S_NOATIME; + if (flags & EXT4_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; ++ if (flags & EXT4_SNAPSHOT_SHARE_FL) ++ inode->i_flags |= EXT4_SNAPSHOT_SHARE_FL; ++ if (flags & EXT4_SNAPSHOT_SP_FL) ++ inode->i_flags |= EXT4_SNAPSHOT_SP_FL; ++ if (flags & EXT4_SNAPSHOT_FL) ++ inode->i_flags |= EXT4_SNAPSHOT_FL; ++ + } + + /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ +@@ -5352,8 +5362,12 @@ + vfs_fl = ei->vfs_inode.i_flags; + old_fl = ei->i_flags; + new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL| +- EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL| +- EXT4_DIRSYNC_FL); ++ EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL| ++ EXT4_DIRSYNC_FL | ++ EXT4_SNAPSHOT_SHARE_FL| ++ EXT4_SNAPSHOT_SP_FL| ++ EXT4_SNAPSHOT_FL); ++ + if (vfs_fl & S_SYNC) + new_fl |= EXT4_SYNC_FL; + if (vfs_fl & S_APPEND) +@@ -5364,6 +5378,12 @@ + new_fl |= EXT4_NOATIME_FL; + if (vfs_fl & S_DIRSYNC) + new_fl |= EXT4_DIRSYNC_FL; ++ if (vfs_fl & EXT4_SNAPSHOT_SHARE_FL) ++ new_fl |= EXT4_SNAPSHOT_SHARE_FL; ++ if (vfs_fl & EXT4_SNAPSHOT_SP_FL) ++ new_fl |= EXT4_SNAPSHOT_SP_FL; ++ if (vfs_fl & EXT4_SNAPSHOT_FL) ++ new_fl |= EXT4_SNAPSHOT_FL; + } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl); + } + +@@ -6061,6 +6081,7 @@ + + return ret; + } ++EXPORT_SYMBOL(ext4_meta_trans_blocks); + + /* + * Calulate the total number of credits to reserve to fit +@@ -6500,3 +6521,26 @@ + return rc; + } + EXPORT_SYMBOL(ext4_map_inode_page); ++ ++int ext4_snapshot_orphan_truncate(struct inode *inode, ++ struct ext4_snapshot_gen_lock **lock) ++{ ++ handle_t *handle; ++ int err; ++ ++ err = ext4_snapshot_lock(inode, false, lock); ++ if ((err != 0) || (*lock == NULL)) 
++ return err; ++ ++ handle = ext4_journal_start(inode, blocks_for_truncate(inode)); ++ if (IS_ERR(handle)) { ++ err = PTR_ERR(handle); ++ return err; ++ } ++ ++ err = ext4_snapshot_punch(handle, inode, inode->i_size, ++ ~0ULL /* = OBD_OBJECT_EOF */); ++ ++ ext4_journal_stop(handle); ++ return err; ++} +diff -urN -x .svn linux-stage.org/fs/ext4/snapshot.c linux-stage/fs/ext4/snapshot.c +--- linux-stage.org/fs/ext4/snapshot.c 1970-01-01 09:00:00.000000000 +0900 ++++ linux-stage/fs/ext4/snapshot.c 2018-11-01 11:49:59.000000000 +0900 +@@ -0,0 +1,1222 @@ ++/* ++ * GPL HEADER START ++ * ++ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 only, ++ * as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License version 2 for more details. A copy is ++ * included in the COPYING file that accompanied this code. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ * ++ * GPL HEADER END ++ */ ++/* ++ * Copyright(c) 2016-2018 FUJITSU LIMITED. ++ * All rights reserved. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include "linux/quotaops.h" ++#include ++#include ++#include "ext4_jbd2.h" ++#include "ext4.h" ++#include "acl.h" ++#include "snapshot.h" ++#include "snapshot_debug.h" ++ ++ ++/* snapshot generation lock list head */ ++static LIST_HEAD(snap_lock_list); ++ ++/* mutex object for snapshot generation lock list */ ++static struct mutex snap_list_mutex; ++ ++/* snapshot lock timeout var */ ++static int snapshot_lock_timeout = SNAPSHOT_LOCK_TIMEOUT; ++ ++ ++/* same as lustre/include/lustre/lustre_user.h */ ++struct lu_fid { ++ /** ++ * FID sequence. Sequence is a unit of migration: all files (objects) ++ * with FIDs from a given sequence are stored on the same server. ++ * Lustre should support 2^64 objects, so even if each sequence ++ * has only a single object we can still enumerate 2^64 objects. ++ **/ ++ __u64 f_seq; ++ /* FID number within sequence. */ ++ __u32 f_oid; ++ /** ++ * FID version, used to distinguish different versions (in the sense ++ * of snapshots, etc.) of the same file system object. Not currently ++ * used. ++ **/ ++ __u32 f_ver; ++}; ++ ++/* same as lustre/include/lu_object.h */ ++struct lu_buf { ++ void *lb_buf; ++ ssize_t lb_len; ++}; ++ ++/* snapshot request data */ ++struct snapshot_list_data { ++ struct lu_fid fid; ++ unsigned long ost_ino; ++}; ++ ++#define IOC_SNAPSHOT_LIST_MAX 256 ++ ++struct snapshot_list_buf { ++ struct snapshot_list_data list_data[IOC_SNAPSHOT_LIST_MAX]; ++ int list_num; ++}; ++ ++struct lustre_mdt_attrs { ++ /** ++ * Bitfield for supported data in this structure. From enum lma_compat. ++ * lma_self_fid and lma_flags are always available. ++ */ ++ __u32 lma_compat; ++ /** ++ * Per-file incompat feature list. Lustre version should support all ++ * flags set in this field. The supported feature mask is available in ++ * LMA_INCOMPAT_SUPP. 
++ */ ++ __u32 lma_incompat; ++ /** FID of this inode */ ++ struct lu_fid lma_self_fid; ++}; ++ ++/* ++ * ext4_get_snapshot_lock_timeout() ++ * ++ * get timeout of snapshot lock wait ++ * ++ * \param[in] - ++ * ++ * \retval snapshot lock timeout (sec) ++ */ ++int ext4_get_snapshot_lock_timeout(void) ++{ ++ /* get the snapshot lock timeout */ ++ return snapshot_lock_timeout; ++} ++EXPORT_SYMBOL(ext4_get_snapshot_lock_timeout); ++ ++/* ++ * ext4_set_snapshot_lock_timeout() ++ * ++ * set timeout of snapshot lock wait ++ * ++ * \param[in] tout snapshot lock timeout (sec) ++ * ++ * \retval none ++ */ ++ ++void ext4_set_snapshot_lock_timeout(int tout) ++{ ++ snapshot_lock_timeout = tout; ++ return; ++} ++EXPORT_SYMBOL(ext4_set_snapshot_lock_timeout); ++ ++/* ++ * ext4_snapshot_get_enable() ++ * ++ * get status of snapshot enable/disable ++ * ++ * \param[in] sb super block ++ * ++ * \retval 0 snapshot is disabled ++ * \retval 1 snapshot is enabled ++ */ ++int ext4_snapshot_get_enable(struct super_block *sb) ++{ ++ /* check the feature bits in super block */ ++ return EXT4_HAS_RO_COMPAT_FEATURE(sb, ++ EXT4_FEATURE_RO_COMPAT_SNAPSHOT); ++} ++EXPORT_SYMBOL(ext4_snapshot_get_enable); ++ ++/* ++ * ext4_snapshot_set_enable() ++ * ++ * enable snapshot functions ++ * ++ * \param[in] sb super block ++ * ++ * \retval 0 success ++ * \retval less than 0 failure (-errno) ++ */ ++int ext4_snapshot_set_enable(struct super_block *sb) ++{ ++ handle_t *handle; ++ int err; ++ ++ /* start journal */ ++ handle = ext4_journal_start_sb(sb, 1); ++ if (IS_ERR(handle)) { ++ err = PTR_ERR(handle); ++ CERROR("fail to start journal err=%d\n", err); ++ goto out; ++ } ++ ++ /* get write access to the superblock buffer */ ++ err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); ++ if (err) { ++ CERROR("error %d on journal write access\n", err); ++ goto out_stop_journal; ++ } ++ ++ /* set the snapshot RO_COMPAT feature bit */ ++ EXT4_SET_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_SNAPSHOT); ++ sb->s_dirt = 1; ++ ++ /* mark dirty */ ++ err =
ext4_handle_dirty_metadata(handle, NULL, ++ EXT4_SB(sb)->s_sbh); ++ if (err) { ++ CERROR("error %d on handle dirty metadata\n", err); ++ goto out_stop_journal; ++ } ++ /* stop journal */ ++ err = ext4_journal_stop(handle); ++ if (err) { ++ CERROR("error %d on journal stop\n", err); ++ goto out; ++ } ++ return 0; ++ ++out_stop_journal: ++ ext4_journal_stop(handle); ++out: ++ return err; ++} ++EXPORT_SYMBOL(ext4_snapshot_set_enable); ++ ++/* ++ * ext4_snapshot_lock() ++ * ++ * lock between the snapshot generations ++ * ++ * \param[in] inode snapshot inode ++ * \param[in] create for create snapshot ++ * \param[out] lock snapshot lock object ++ * this data is used at unlock ++ * ++ * \retval 0 success ++ * \retval less than 0 failure (-errno) ++ */ ++int ext4_snapshot_lock(struct inode *inode, bool create, ++ struct ext4_snapshot_gen_lock **lock) ++{ ++ struct ext4_snapshot_gen_lock *entry; ++ int err = 0, delay, match_f, count = 0, max_count; ++ unsigned long org_ino; ++ struct timespec org_ts; ++ struct ext4_snapshot_link link; ++ /* calc lock delay time */ ++ delay = HZ / SNAPSHOT_LOCK_FREQ; ++ if (delay == 0) ++ delay = 1; ++ /* calc snapshot lock timeout count */ ++ max_count = snapshot_lock_timeout * SNAPSHOT_LOCK_FREQ; ++ /* initialize lock object */ ++ *lock = NULL; ++ ++ if (!create && !EXT4_TEST_OST_SNAPSHOT(inode)) ++ return 0; ++ ++ /* is it 1st time of create snapshot or snapshot orig file*/ ++ if (!EXT4_TEST_OST_SNAPSHOT_FILE(inode)) { ++ org_ino = inode->i_ino; ++ org_ts = EXT4_I(inode)->i_crtime; ++ } else { ++ err = ext4_snapshot_get_link(inode, &link); ++ if (err) ++ goto err_out; ++ org_ino = link.org_ino; ++ org_ts = link.org_ts; ++ } ++ ++repeat: ++ /* lock snapshot lock list */ ++ mutex_lock(&snap_list_mutex); ++ match_f = 0; ++ /* scan list, and find same lock object from list */ ++ list_for_each_entry(entry, &snap_lock_list, list) { ++ /* compare inode no & inode mtime */ ++ if (entry->org == org_ino && ++ SNAPSHOT_MATCH_TS(&entry->ts, &org_ts)) 
{ ++ match_f = 1; ++ break; ++ } ++ } ++ ++ /* add snapshot lock object into list, if not found in list */ ++ if (!match_f) { ++ entry = NULL; ++ if (!create && !EXT4_TEST_OST_SNAPSHOT(inode)) { ++ mutex_unlock(&snap_list_mutex); ++ return 0; ++ } ++ entry = kmalloc(sizeof(*entry), GFP_NOFS); ++ if (entry == NULL) { ++ CERROR("fail to allocate snapshot lock.\n"); ++ err = -ENOMEM; ++ /* unlock snapshot lock list */ ++ mutex_unlock(&snap_list_mutex); ++ goto err_out; ++ } ++ entry->org = org_ino; ++ entry->ts = org_ts; ++ list_add_tail(&entry->list, &snap_lock_list); ++ /* now snapshot is locking */ ++ *lock = entry; ++ /* unlock snapshot lock list */ ++ mutex_unlock(&snap_list_mutex); ++ return 0; ++ } ++ /* unlock snapshot lock list */ ++ mutex_unlock(&snap_list_mutex); ++ ++ /* wait delay ms, if snapshot is already locked */ ++ count++; ++ if ((max_count > 0) && (count > max_count)) { ++ CWARN("snapshot lock timeout.\n"); ++ err = -ETIME; ++ goto err_out; ++ } ++ schedule_timeout_uninterruptible(delay); ++ goto repeat; ++ ++err_out: ++ return err; ++} ++EXPORT_SYMBOL(ext4_snapshot_lock); ++ ++/* ++ * ext4_snapshot_unlock() ++ * ++ * unlock between the snapshot generations ++ * ++ * \param[in] lock snapshot lock object ++ * ++ * \retval none ++ */ ++void ext4_snapshot_unlock(struct ext4_snapshot_gen_lock *lock) ++{ ++ LASSERT(lock); ++ /* lock snapshot lock list */ ++ mutex_lock(&snap_list_mutex); ++ /* delete lock object from list */ ++ list_del(&lock->list); ++ /* unlock snapshot lock list */ ++ mutex_unlock(&snap_list_mutex); ++ /* free napshot lock object */ ++ kfree(lock); ++ return; ++} ++EXPORT_SYMBOL(ext4_snapshot_unlock); ++ ++struct inode *ext4_snapshot_read_link(struct inode *inode, ++ struct ext4_snapshot_link *my_link, ++ struct ext4_snapshot_link *tgt_link, ++ bool new, int *err) ++{ ++ unsigned long tgt_ino; ++ struct inode *tgt_inode; ++ *err = 0; ++ if (new) { ++ if (my_link->new_ino == 0) ++ return NULL; ++ tgt_ino = my_link->new_ino; ++ } else { 
++ if (my_link->old_ino == 0) ++ return NULL; ++ tgt_ino = my_link->old_ino; ++ } ++ *err = -ENOENT; ++ ++ tgt_inode = ext4_iget(inode->i_sb, tgt_ino); ++ if (IS_ERR(tgt_inode)) { ++ *err = PTR_ERR(tgt_inode); ++ CERROR("fail to get %s snapshot inode. " ++ "me=%lu tgt_inode=%lu err=%d\n", (new ? "new" : "old"), ++ inode->i_ino, tgt_ino, *err); ++ return NULL; ++ } ++ if (!EXT4_TEST_OST_SNAPSHOT(tgt_inode)) { ++ CERROR("%s snapshot inode is invalid flag. " ++ "inode=%lu flag=%lx\n", ++ (new ? "new" : "old"), tgt_ino, ++ EXT4_I(tgt_inode)->i_flags); ++ goto err_close; ++ } ++ *err = ext4_snapshot_get_link(tgt_inode, tgt_link); ++ if (*err) ++ goto err_close; ++ ++ if ((tgt_link->org_ino != my_link->org_ino) || ++ (!SNAPSHOT_MATCH_TS(&tgt_link->org_ts, &my_link->org_ts)) || ++ (new && (tgt_link->old_ino != inode->i_ino)) || ++ (!new && (tgt_link->new_ino != inode->i_ino))) { ++ CERROR("invalid %s snapshot link. " ++ "mylink : ino=%lu orig=%lu new=%lu old=%lu ts=%llu " ++ "tgtlink : ino=%lu orig=%lu new=%lu old=%lu ts=%llu\n", ++ (new ? "new" : "old"), ++ inode->i_ino, my_link->org_ino, my_link->new_ino, ++ my_link->old_ino, ++ SNAPSHOT_CLTIME(&my_link->org_ts), ++ tgt_inode->i_ino, tgt_link->org_ino, tgt_link->new_ino, ++ tgt_link->old_ino, SNAPSHOT_CLTIME(&tgt_link->org_ts)); ++ *err = -EXDEV; ++ goto err_close; ++ } ++ return tgt_inode; ++ ++ err_close: ++ iput(tgt_inode); ++ return NULL; ++} ++ ++/* ++ * ext4_snapshot_clone() ++ * ++ * update the snapshot link information in the xattr ++ * target inode is a original file, new created snapshot ++ * and previous generation of snapshot. 
++ * ++ * \param[in] snap_inode new created snapshot inode ++ * \param[in] orig_inode snapshot original file inode ++ * ++ * \retval 0 success ++ * \retval less than 0 failure (retval is error code) ++ */ ++ ++int ext4_snapshot_clone(struct inode *snap_inode, ++ struct inode *orig_inode) ++{ ++ handle_t *handle; ++ struct inode *old_inode = NULL; ++ struct ext4_snapshot_link orig_link, snap_link, old_link; ++ loff_t rb_snap_size, rb_snap_disksize; ++ int err = 0, err2 = 0, credits; ++ ++ rb_snap_size = i_size_read(snap_inode); ++ rb_snap_disksize = EXT4_I(snap_inode)->i_disksize; ++ ++ if (!EXT4_TEST_OST_SNAPSHOT(orig_inode)) { ++ orig_link.org_ino = orig_inode->i_ino; ++ orig_link.org_ts = EXT4_I(orig_inode)->i_crtime; ++ orig_link.new_ino = 0; ++ orig_link.old_ino = 0; ++ } else { ++ err = ext4_snapshot_get_link(orig_inode, &orig_link); ++ if (err) { ++ SNAPSHOT_CONSOLE_ERR(err); ++ return err; ++ } ++ } ++ ++ /* get old link */ ++ if (orig_link.old_ino) { ++ old_inode = ext4_snapshot_read_link(orig_inode, ++ &orig_link, ++ &old_link, ++ false, &err); ++ if (err) { ++ SNAPSHOT_CONSOLE_ERR(err); ++ return err; ++ } ++ } ++ ++ credits = ext4_calc_snapshot_link_credits(orig_inode, ++ SNAPSHOT_CREATE_OP); ++ /* start journal */ ++ handle = ext4_journal_start(snap_inode, credits); ++ if (IS_ERR(handle)) { ++ err = PTR_ERR(handle); ++ CERROR("fail to start snapshot journal. 
inode=%lu err=%d\n", ++ snap_inode->i_ino, err); ++ SNAPSHOT_CONSOLE_ERR(err); ++ goto out_iput; ++ } ++ ++ /* update new snap xattr link */ ++ snap_link.org_ino = orig_inode->i_ino; ++ snap_link.org_ts = orig_link.org_ts; ++ snap_link.new_ino = orig_inode->i_ino; ++ snap_link.old_ino = orig_link.old_ino; ++ ++ err = ext4_snapshot_set_link(handle, snap_inode, &snap_link); ++ if (err) { ++ SNAPSHOT_CONSOLE_ERR(err); ++ goto out_stop_journal; ++ } ++ ++ /* update xattr snapshot link of old snap */ ++ if (old_inode) { ++ old_link.new_ino = snap_inode->i_ino; ++ err = ext4_snapshot_set_link(handle, old_inode, &old_link); ++ if (err) { ++ SNAPSHOT_CONSOLE_ERR(err); ++ goto out_rb_snap_link; ++ } ++ } ++ ++ /* update original xattr link */ ++ orig_link.old_ino = snap_inode->i_ino; ++ err = ext4_snapshot_set_link(handle, orig_inode, &orig_link); ++ if (err) { ++ SNAPSHOT_CONSOLE_ERR(err); ++ goto out_rb_old_link; ++ } ++ ++ /* set new snapshot size & disksize */ ++ i_size_write(snap_inode, i_size_read(orig_inode)); ++ EXT4_I(snap_inode)->i_disksize = EXT4_I(orig_inode)->i_disksize; ++ /* set new snapshot i_flags */ ++ EXT4_SET_OST_SNAPSHOT_FILE(snap_inode); ++ /* update new snapshot inode */ ++ err = ext4_mark_inode_dirty(handle, snap_inode); ++ if (err) { ++ CERROR("fail to dirty new snapshot flags. 
inode=%lu err=%d\n", ++ snap_inode->i_ino, err); ++ SNAPSHOT_CONSOLE_ERR(err); ++ /* roll-back snapshot size & disksize to the values saved on entry */ ++ i_size_write(snap_inode, rb_snap_size); ++ EXT4_I(snap_inode)->i_disksize = rb_snap_disksize; ++ /* clear new snapshot i_flags */ ++ EXT4_CLEAR_OST_SNAPSHOT_FLAGS(snap_inode); ++ goto out_rb_orig_link; ++ } ++ ++ /* update original i_flags(SNAPSHOT_SHARE_FL) */ ++ if (!(EXT4_TEST_OST_SNAPSHOT_ORIG(orig_inode))) { ++ /* set inode flags (SNAPSHOT_SHARE_FL) */ ++ EXT4_SET_OST_SNAPSHOT_ORIG(orig_inode); ++ /* update original inode */ ++ err = ext4_mark_inode_dirty(handle, orig_inode); ++ if (err) { ++ SNAPSHOT_CONSOLE_ERR(err); ++ CERROR("fail to dirty orig snapshot flags. " ++ "inode=%lu err=%d\n", orig_inode->i_ino, err); ++ EXT4_CLEAR_OST_SNAPSHOT_FLAGS(orig_inode); ++ goto out_rb_snap_flag; ++ } ++ } ++ ++ ext4_journal_stop(handle); ++ if (old_inode) ++ iput(old_inode); ++ ++ return 0; ++ ++out_rb_snap_flag: ++ /* roll-back snapshot size & disksize to the values saved on entry */ ++ i_size_write(snap_inode, rb_snap_size); ++ EXT4_I(snap_inode)->i_disksize = rb_snap_disksize; ++ /* clear snapshot inode i_flags of SNAPSHOT_FLAGS */ ++ EXT4_CLEAR_OST_SNAPSHOT_FLAGS(snap_inode); ++ /* write back the rolled-back new snapshot inode */ ++ err2 = ext4_mark_inode_dirty(handle, snap_inode); ++ if (err2) { ++ SNAPSHOT_CONSOLE_ERR(err2); ++ goto out_stop_journal; ++ } ++out_rb_orig_link: ++ /* roll-back xattr snapshot link of original */ ++ if (!(EXT4_TEST_OST_SNAPSHOT_ORIG(orig_inode))) ++ err2 = ext4_snapshot_del_link(handle, orig_inode); ++ else { ++ orig_link.old_ino = old_inode ? old_inode->i_ino : 0; ++ err2 = ext4_snapshot_set_link(handle, ++ orig_inode, ++ &orig_link); ++ ++ } ++ if (err2) { ++ SNAPSHOT_CONSOLE_ERR(err2); ++ goto out_stop_journal; ++ } ++out_rb_old_link: ++ if (old_inode) { ++ old_link.new_ino = orig_inode->i_ino; ++ err2 = ext4_snapshot_set_link(handle, ++ old_inode, &old_link); ++ if (err2) { ++ SNAPSHOT_CONSOLE_ERR(err2); ++ goto out_stop_journal; ++ } ++ } ++out_rb_snap_link: ++ /* delete xattr
snapshot link of new snapshot */ ++ err2 = ext4_snapshot_del_link(handle, snap_inode); ++ if (err2) ++ SNAPSHOT_CONSOLE_ERR(err2); ++out_stop_journal: ++ /* stop journal */ ++ ext4_journal_stop(handle); ++out_iput: ++ if (old_inode) ++ iput(old_inode); ++ return err; ++} ++EXPORT_SYMBOL(ext4_snapshot_clone); ++ ++int ext4_snapshot_punch(handle_t *handle, struct inode *inode, ++ __u64 start, __u64 end) ++{ ++ struct ext4_snapshot_link my_link, old_link; ++ int err, credits; ++ struct inode *old_snap = NULL; ++ ext4_lblk_t start_block, end_block; ++ ++ err = ext4_snapshot_get_link(inode, &my_link); ++ if (err) { ++ SNAPSHOT_CONSOLE_ERR(err); ++ goto out_clear; ++ } ++ old_snap = ext4_snapshot_read_link(inode, ++ &my_link, &old_link, ++ false, &err); ++ if (err == 0 && old_snap == NULL) { ++ CERROR("invalid orig link link.old_ino == 0," ++ " inode=%lu\n", inode->i_ino); ++ err = -ENOENT; ++ } ++ if (err) { ++ SNAPSHOT_CONSOLE_ERR(err); ++ goto out_delete_link; ++ } ++ if (end > old_snap->i_size) ++ end = old_snap->i_size; ++ ++ if (start >= end) { ++ iput(old_snap); ++ return 0; ++ } ++ ++ start_block = start >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); ++ end_block = (end + EXT4_BLOCK_SIZE(inode->i_sb) - 1) ++ >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); ++ ++ err = ext4_snapshot_truncate_blocks(handle, inode, old_snap, ++ start_block, end_block); ++ iput(old_snap); ++ if (err) { ++ SNAPSHOT_CONSOLE_ERR(err); ++ return err; ++ } ++ return 0; ++ ++out_delete_link: ++ credits = ext4_calc_snapshot_link_credits(inode, ++ SNAPSHOT_CLEAR_LINK_OP); ++ if (!ext4_handle_has_enough_credits(handle, credits)) { ++ err = ext4_journal_extend(handle, credits); ++ if (err < 0) { ++ CERROR("couldn't extend journal inode=%lu " ++ "handle=%p need=%d has=%d err=%d\n", ++ inode->i_ino, handle, credits, ++ handle->h_buffer_credits, err); ++ goto out_clear; ++ } else if (err != 0) { ++ err = ext4_journal_restart(handle, credits); ++ if (err) { ++ /* ++ * This should never happen. 
++ * It may panic with ext4_journal_stop() ++ * if ext4_journal_restart() fails. ++ */ ++ CERROR("couldn't extend and restart " ++ "journal inode=%lu handle=%p " ++ "need=%d err=%d\n", ++ inode->i_ino, handle, credits, err); ++ goto out_clear; ++ } ++ } ++ } ++ ext4_snapshot_del_link(handle, inode); ++out_clear: ++ /* When err, return the orig inode to an ordinary inode ++ * and continue to truncate processing */ ++ EXT4_CLEAR_OST_SNAPSHOT_FLAGS(inode); ++ ext4_mark_inode_dirty(handle, inode); ++ return 0; ++} ++EXPORT_SYMBOL(ext4_snapshot_punch); ++ ++static int ext4_snapshot_get_ostfid(struct inode *inode, ++ struct lu_fid *fid) ++{ ++ int rc = 0; ++ struct lustre_mdt_attrs lma; ++ rc = ext4_xattr_get(inode, ++ EXT4_XATTR_INDEX_TRUSTED, ++ "lma", (void *)&lma, ++ sizeof(lma)); ++ if (rc == sizeof(lma)) { ++ memcpy(fid, &(lma.lma_self_fid), ++ sizeof(lma.lma_self_fid)); ++ return 0; ++ } else if (rc > 0) ++ return -ENOENT; ++ else ++ return rc; ++} ++ ++int ext4_snapshot_get_orphan(struct inode *inode, ++ void *fid_buf, ++ int *array_num) ++{ ++ int err = 0, err2 = 0; ++ struct lu_fid *fid_array = fid_buf; ++ struct ext4_snapshot_link my_link, cur_link, next_link; ++ struct ext4_snapshot_link *cur_link_buf = NULL, ++ *next_link_buf = NULL, *tmp = NULL; ++ struct inode *new_snap = NULL, *cur = NULL; ++ int max_array = *array_num; ++ int num = 0; ++ unsigned long tmp_ino = 0; ++ handle_t *handle = NULL; ++ ++ err = ext4_snapshot_get_link(inode, &my_link); ++ if (err) { ++ SNAPSHOT_CONSOLE_ERR(err); ++ return err; ++ } ++ ++ if ((my_link.old_ino != 0) || ++ (my_link.new_ino == 0)) { ++ return -ENOENT; ++ } ++ new_snap = ext4_snapshot_read_link(inode, ++ &my_link, &next_link, ++ true, &err); ++ if (err) { ++ SNAPSHOT_CONSOLE_ERR(err); ++ return err; ++ } ++ if (!EXT4_TEST_OST_SNAPSHOT_DEL(new_snap)) { ++ iput(new_snap); ++ *array_num = 0; ++ return -ENOENT; ++ } ++ ++ cur = new_snap; ++ next_link_buf = &cur_link; ++ cur_link_buf = &next_link; ++ ++ while (cur && 
EXT4_TEST_OST_SNAPSHOT_DEL(cur)) { ++ new_snap = ext4_snapshot_read_link(cur, cur_link_buf, ++ next_link_buf, ++ true, &err); ++ if (err) { ++ iput(cur); ++ SNAPSHOT_CONSOLE_ERR(err); ++ return err; ++ } ++ if (num >= max_array) { ++ if (new_snap) ++ iput(new_snap); ++ err = -EAGAIN; ++ break; ++ } ++ ++ err = ext4_snapshot_get_ostfid(cur, &fid_array[num]); ++ if (err) { ++ iput(cur); ++ if (new_snap) ++ iput(new_snap); ++ SNAPSHOT_CONSOLE_ERR(err); ++ return err; ++ } ++ ++ num++; ++ iput(cur); ++ cur = new_snap; ++ tmp = cur_link_buf; ++ cur_link_buf = next_link_buf; ++ next_link_buf = tmp; ++ } ++ ++ handle = ext4_journal_start(inode, ++ ext4_calc_snapshot_link_credits(inode, ++ SNAPSHOT_DELETE_NEW_OP)); ++ if (IS_ERR(handle)) { ++ err2 = PTR_ERR(handle); ++ SNAPSHOT_CONSOLE_ERR(err2); ++ CERROR("fail to journal start. inode=%lu err=%d\n", ++ inode->i_ino, err2); ++ goto out_iput; ++ } ++ ++ if (cur == NULL) ++ my_link.new_ino = 0; ++ else if (my_link.new_ino != cur->i_ino) { ++ my_link.new_ino = cur->i_ino; ++ /* for rollback */ ++ tmp_ino = cur_link_buf->old_ino; ++ cur_link_buf->old_ino = inode->i_ino; ++ err2 = ext4_snapshot_set_link(handle, cur, cur_link_buf); ++ if (err2) { ++ SNAPSHOT_CONSOLE_ERR(err2); ++ goto out_stop; ++ } ++ } ++ err2 = ext4_snapshot_set_link(handle, inode, &my_link); ++ /* rollback; tmp_ino is non-zero only if cur was relinked above */ ++ if (err2 && cur != NULL && tmp_ino != 0) { ++ cur_link_buf->old_ino = tmp_ino; ++ ext4_snapshot_set_link(handle, cur, cur_link_buf); ++ } ++ out_stop: ++ ext4_journal_stop(handle); ++ out_iput: ++ *array_num = num; ++ if (cur) ++ iput(cur); ++ ++ return err2 ?
err2 : err; ++} ++EXPORT_SYMBOL(ext4_snapshot_get_orphan); ++ ++static int ext4_snapshot_delete_link(handle_t *handle, ++ struct inode *inode, ++ struct inode *new_snap, ++ struct inode *old_snap, ++ struct ext4_snapshot_link *my_link, ++ struct ext4_snapshot_link *new_link, ++ struct ext4_snapshot_link *old_link, ++ void *orig_fid) ++{ ++ int err = 0; ++ unsigned long old_snap_new_ino = 0; ++ ++ /* Is there an older snapshot? Make it point past us. */ ++ if (old_snap) { ++ /* saved so that the link can be rolled back below */ ++ old_snap_new_ino = old_link->new_ino; ++ ++ /* update old snapshot link */ ++ old_link->new_ino = my_link->new_ino; ++ err = ext4_snapshot_set_link(handle, old_snap, old_link); ++ if (err) ++ return err; ++ } ++ ++ /* Is there a newer inode in the chain? */ ++ if (new_snap) { ++ if (EXT4_TEST_OST_SNAPSHOT_ORIG(new_snap) ++ && my_link->old_ino == 0) { ++ EXT4_CLEAR_OST_SNAPSHOT_FLAGS(new_snap); ++ ++ /* delete snapshot link; restore the ORIG flag on failure */ ++ err = ext4_snapshot_del_link(handle, new_snap); ++ if (err) ++ EXT4_SET_OST_SNAPSHOT_ORIG(new_snap); ++ ++ ext4_mark_inode_dirty(handle, new_snap); ++ if (err == 0) { ++ err = ext4_snapshot_get_ostfid(new_snap, ++ (struct lu_fid *)orig_fid); ++ if (err) { ++ SNAPSHOT_CONSOLE_ERR(err); ++ } else { ++ /* last snapshot deleted: return 1, orig_fid read from new_snap */ ++ err = 1; ++ } ++ } ++ } else { ++ new_link->old_ino = my_link->old_ino; ++ /* update newer snapshot link */ ++ err = ext4_snapshot_set_link(handle, ++ new_snap, new_link); ++ } ++ ++ if (err < 0) { ++ if (old_snap) { ++ /* rollback the link of the older snapshot */ ++ old_link->new_ino = old_snap_new_ino; ++ ext4_snapshot_set_link(handle, ++ old_snap, old_link); ++ } else { ++ /* No older snapshot to restore: report success (force delete). */ ++ err = 0; ++ } ++ return err; ++ } ++ } ++ /* Even if clearing our own link fails, do not roll back, ++ * because this node is about to be deleted. 
++ */ ++ my_link->old_ino = my_link->new_ino = 0; ++ ext4_snapshot_set_link(handle, inode, my_link); ++ ++ return err; ++} ++ ++int ext4_snapshot_destroy(struct inode *inode, void *orig_fid) ++{ ++ struct ext4_snapshot_link my_link, new_link, old_link; ++ struct inode *old_snap = NULL, *new_snap = NULL; ++ handle_t *handle; ++ int err, credits; ++ ++ err = ext4_snapshot_get_link(inode, &my_link); ++ if (err) { ++ SNAPSHOT_CONSOLE_ERR(err); ++ return err; ++ } ++ ++ new_snap = ext4_snapshot_read_link(inode, ++ &my_link, &new_link, ++ true, &err); ++ if (err) { ++ SNAPSHOT_CONSOLE_ERR(err); ++ err = 0; ++ my_link.new_ino = 0; ++ } ++ ++ old_snap = ext4_snapshot_read_link(inode, ++ &my_link, &old_link, ++ false, &err); ++ if (err) { ++ SNAPSHOT_CONSOLE_ERR(err); ++ err = 0; ++ my_link.old_ino = 0; ++ } ++ ++ credits = ext4_calc_snapshot_link_credits(inode, SNAPSHOT_DELETE_OP); ++ ++ /* The block copy below only targets an older snapshot; ++ * if inode is the oldest one there is nothing to copy */ ++ handle = ext4_journal_start(inode, credits); ++ if (IS_ERR(handle)) { ++ err = PTR_ERR(handle); ++ SNAPSHOT_CONSOLE_ERR(err); ++ CERROR("fail to journal start. 
inode=%lu err=%d\n", ++ inode->i_ino, err); ++ goto out_err; ++ } ++ ++ if (old_snap) { ++ ext4_lblk_t end_block; ++ __u64 end = inode->i_size; ++ if (end > old_snap->i_size) ++ end = old_snap->i_size; ++ ++ if (end == 0) ++ goto skip_copy; ++ ++ end_block = (end + EXT4_BLOCK_SIZE(inode->i_sb) - 1) ++ >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); ++ ++ err = ext4_snapshot_copy_blocks(handle, inode, old_snap, ++ 0, end_block, false); ++ if (err) { ++ SNAPSHOT_CONSOLE_ERR(err); ++ goto out_stop; ++ } ++ } ++ ++ if (!ext4_handle_has_enough_credits(handle, credits)) { ++ err = ext4_journal_extend(handle, credits); ++ if (err < 0) { ++ CERROR("couldn't extend journal inode=%lu " ++ "handle=%p need=%d has=%d err=%d\n", ++ inode->i_ino, handle, credits, ++ handle->h_buffer_credits, err); ++ goto out_stop; ++ } else if (err != 0) { ++ err = ext4_journal_restart(handle, credits); ++ if (err) { ++ /* ++ * This should never happen. ++ * It may panic with ext4_journal_stop() ++ * if ext4_journal_restart() fails. 
++ */ ++ CERROR("couldn't extend and restart " ++ "journal inode=%lu handle=%p " ++ "need=%d err=%d\n", ++ inode->i_ino, handle, credits, err); ++ goto out_stop; ++ } ++ } ++ } ++ skip_copy: ++ err = ext4_snapshot_delete_link(handle, inode, new_snap, old_snap, ++ &my_link, &new_link, &old_link, ++ orig_fid); ++ ++ SNAPSHOT_CONSOLE_ERR(err); ++ ++ out_stop: ++ ext4_journal_stop(handle); ++ out_err: ++ if (new_snap) ++ iput(new_snap); ++ if (old_snap) ++ iput(old_snap); ++ return err; ++} ++EXPORT_SYMBOL(ext4_snapshot_destroy); ++ ++int ext4_snapshot_get_old_list(struct inode *inode, ++ void *buf) ++{ ++ struct lu_buf *bufp = (struct lu_buf *)buf; ++ struct snapshot_list_buf *list_buf = NULL; ++ struct inode *cur = NULL, *next_inode = NULL; ++ struct ext4_snapshot_gen_lock *lock = NULL; ++ struct ext4_snapshot_link cur_link, next_link; ++ struct ext4_snapshot_link *cur_link_buf = NULL, ++ *next_link_buf = NULL, *tmp = NULL; ++ int i = 0, rc = 0; ++ ++ if (bufp) { ++ if (bufp->lb_len < sizeof(struct snapshot_list_buf)) { ++ CERROR("invalid snapshot_list_buf\n"); ++ return -EFAULT; ++ } ++ list_buf = (struct snapshot_list_buf *)bufp->lb_buf; ++ } ++ rc = ext4_snapshot_lock(inode, false, &lock); ++ if (rc) { ++ CERROR("fail to lock snapshot. err=%d\n", rc); ++ return rc; ++ } ++ if (!lock || !EXT4_TEST_OST_SNAPSHOT_DEL(inode)) { ++ CERROR("inode is not orphan snapshot ino=%lu\n", ++ inode->i_ino); ++ rc = -ENOENT; ++ goto out; ++ } ++ rc = ext4_snapshot_get_link(inode, &cur_link); ++ if (rc) ++ goto out; ++ ++ if (list_buf) { ++ list_buf->list_data[i].ost_ino = inode->i_ino; ++ list_buf->list_data[i].fid.f_seq = 0; ++ } ++ ++ next_link_buf = &next_link; ++ cur_link_buf = &cur_link; ++ cur = inode; ++ i++; ++ /* Walk towards older snapshots. inode belongs to the caller; every ++ * inode returned by ext4_snapshot_read_link() must be iput() here. */ ++ while (cur_link_buf->old_ino) { ++ next_inode = ext4_snapshot_read_link(cur, cur_link_buf, ++ next_link_buf, ++ false, &rc); ++ if (rc) { ++ if (rc == -EXDEV && ++ EXT4_TEST_OST_SNAPSHOT_DEL(cur)) ++ CWARN("orphan inode has invalid old snapshot link. 
" ++ "probably failed to delete inode " ++ "after being removed from snapshot link. " ++ "inode=%lu\n", cur->i_ino); ++ if (list_buf && (cur != inode) && ++ i < IOC_SNAPSHOT_LIST_MAX) { ++ /* if cur == inode, do not record the error so that ++ * inode stays deletable; never write past list_data[] ++ */ ++ list_buf->list_data[i].fid.f_oid = -rc; ++ list_buf->list_data[i].ost_ino = 0; ++ i++; ++ } ++ CERROR("invalid old link err=%d inode=%lu\n", ++ rc, cur->i_ino); ++ rc = 0; ++ break; ++ } ++ if (cur != inode) /* drop the reference on the entry we walked past */ ++ iput(cur); ++ cur = next_inode; ++ tmp = cur_link_buf; ++ cur_link_buf = next_link_buf; ++ next_link_buf = tmp; ++ if (!list_buf) { ++ rc = -EEXIST; ++ break; ++ } ++ if (i >= IOC_SNAPSHOT_LIST_MAX) { ++ rc = -EAGAIN; ++ break; ++ } ++ list_buf->list_data[i].ost_ino = cur->i_ino; ++ if (EXT4_TEST_OST_SNAPSHOT_DEL(cur)) { ++ list_buf->list_data[i].fid.f_seq = 0; ++ list_buf->list_data[i].fid.f_oid = 0; ++ } else { ++ struct lu_fid ff; ++ /* pack mdt_fid */ ++ rc = ext4_xattr_get(next_inode, ++ EXT4_XATTR_INDEX_TRUSTED, ++ "fid", (void *)&ff, ++ sizeof(ff)); ++ if (rc == sizeof(ff)) { ++ memcpy(&list_buf->list_data[i].fid, &ff, ++ sizeof(ff)); ++ /* Currently, the f_ver is not the real parent ++ * MDT-object's FID::f_ver, instead it ++ * is the OST-object index in its ++ * parent MDT-object's layout EA. 
*/ ++ list_buf->list_data[i].fid.f_ver = 0; ++ } else { ++ list_buf->list_data[i].fid.f_seq = 0; ++ if (rc > 0) ++ list_buf->list_data[i].fid.f_oid = ENOENT; ++ else ++ list_buf->list_data[i].fid.f_oid = -rc; ++ } ++ rc = 0; ++ } ++ i++; ++ } ++ if (cur != inode) /* never drop the reference owned by the caller */ ++ iput(cur); ++ if (list_buf) ++ list_buf->list_num = i; ++out: ++ if (lock) ++ ext4_snapshot_unlock(lock); ++ return rc; ++} ++EXPORT_SYMBOL(ext4_snapshot_get_old_list); ++ ++int ext4_snapshot_list_orphan(struct super_block *sb, ++ void *buf) ++{ ++ struct lu_buf *bufp = (struct lu_buf *)buf; ++ struct snapshot_list_buf *list_buf; ++ struct buffer_head *inode_bitmap_bh = NULL; ++ ext4_group_t ngroups; ++ unsigned long ino = 0; ++ struct inode *inode; ++ ext4_group_t i; ++ int j = 0, rc = 0; ++ ++ if (!bufp || bufp->lb_len < sizeof(struct snapshot_list_buf)) { ++ CERROR("invalid snapshot_list_buf\n"); ++ return -EFAULT; ++ } ++ list_buf = (struct snapshot_list_buf *)bufp->lb_buf; ++ ++ ngroups = ext4_get_groups_count(sb); ++ for (i = 0, j = 0; i < ngroups; i++, ino = 0) { ++ inode_bitmap_bh = ext4_read_inode_bitmap(sb, i); ++ if (!inode_bitmap_bh) { ++ CERROR("fail to get inode_bitmap group=%u\n", ++ i); ++ return -EIO; ++ } ++repeat_in_this_group: ++ ino = ext4_find_next_bit((unsigned long *) ++ inode_bitmap_bh->b_data, ++ EXT4_INODES_PER_GROUP(sb), ino); ++ ++ if (++ino > EXT4_INODES_PER_GROUP(sb)) { ++ brelse(inode_bitmap_bh); ++ continue; ++ } ++ ++ inode = ext4_iget(sb, ++ ino + (EXT4_INODES_PER_GROUP(sb) * i)); ++ if (!IS_ERR(inode)) { ++ if (EXT4_TEST_OST_SNAPSHOT_DEL(inode)) { ++ if (j >= IOC_SNAPSHOT_LIST_MAX) { ++ rc = -EAGAIN; ++ brelse(inode_bitmap_bh); ++ iput(inode); ++ break; ++ } ++ list_buf->list_data[j].ost_ino = inode->i_ino; ++ /* pack ost_fid */ ++ rc = ext4_snapshot_get_ostfid(inode, ++ &list_buf->list_data[j].fid); ++ if (rc) { ++ list_buf->list_data[j].fid.f_seq = 0; ++ list_buf->list_data[j].fid.f_oid = -rc; ++ rc = 0; ++ } ++ j++; ++ } ++ iput(inode); ++ } else { ++ CDEBUG(D_INFO, "failed to get 
inode ino=%lu\n", ino); ++ } ++ goto repeat_in_this_group; ++ } ++ list_buf->list_num = j; ++ return rc; ++} ++EXPORT_SYMBOL(ext4_snapshot_list_orphan); ++ ++/* ++ * ext4_snapshot_init() ++ * ++ * initialize snapshot at module loading ++ * ++ * \param[in] none ++ * ++ * \retval none ++ */ ++void ext4_snapshot_init(void) ++{ ++ /* create mutex object for snapshot lock list */ ++ mutex_init(&snap_list_mutex); ++ ++ /* initialize snapshot lock list */ ++ INIT_LIST_HEAD(&snap_lock_list); ++ ++ return; ++} ++ ++/* ++ * ext4_snapshot_exit() ++ * ++ * finalize snapshot at module removing ++ * ++ * \param[in] none ++ * ++ * \retval none ++ */ ++void ext4_snapshot_exit(void) ++{ ++ struct ext4_snapshot_gen_lock *entry, *tmp; ++ ++ /* lock snapshot lock list */ ++ mutex_lock(&snap_list_mutex); ++ /* delete & free all snapshot generation lock object from list */ ++ list_for_each_entry_safe(entry, tmp, &snap_lock_list, list) { ++ /* delete lock object from list */ ++ list_del(&entry->list); ++ /* free snapshot lock object */ ++ kfree(entry); ++ } ++ /* unlock snapshot lock list */ ++ mutex_unlock(&snap_list_mutex); ++ /* destroy snapshot list lock object */ ++ mutex_destroy(&snap_list_mutex); ++ return; ++} +diff -urN -x .svn linux-stage.org/fs/ext4/snapshot.h linux-stage/fs/ext4/snapshot.h +--- linux-stage.org/fs/ext4/snapshot.h 1970-01-01 09:00:00.000000000 +0900 ++++ linux-stage/fs/ext4/snapshot.h 2018-11-01 12:13:30.000000000 +0900 +@@ -0,0 +1,368 @@ ++/* ++ * GPL HEADER START ++ * ++ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 only, ++ * as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the ++ * GNU General Public License version 2 for more details. A copy is ++ * included in the COPYING file that accompanied this code. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ * ++ * GPL HEADER END ++ */ ++/* ++ * Copyright(c) 2016-2018 FUJITSU LIMITED. ++ * All rights reserved. ++ */ ++ ++#ifndef _EXT4_SNAPSHOT_H ++#define _EXT4_SNAPSHOT_H ++ ++#include "ext4_extents.h" ++#include "xattr.h" ++#define DEBUG_SNAPSHOT ++ ++#ifdef DEBUG_SNAPSHOT ++extern void ext4_show_snapshot_link(struct inode *inode, ++ const char *msg); ++extern void ext4_show_snapshot_blocks(struct inode *inode, ++ const char *msg, int flag); ++#else ++#define ext4_show_snapshot_link(inode, msg) ++#define ext4_show_snapshot_blocks(inode, msg, flag) ++#endif ++ ++ ++#define EXT4_XATTR_NAME_SNAPSHOT_LINK "snapshot_link" ++ ++/* snapshot flags in ext4. 
++ * other snapshot flags are defined in ++ * lustre/include/lustre_snapshot.h */ ++#define SNAPSHOT_FLAGS_MASK \ ++ (EXT4_SNAPSHOT_FL | EXT4_SNAPSHOT_SP_FL | EXT4_SNAPSHOT_SHARE_FL) ++ ++#define SNAPSHOT_FLAGS_NODELMASK \ ++ (EXT4_SNAPSHOT_SP_FL | EXT4_SNAPSHOT_SHARE_FL) ++ ++#define OST_SNAPSHOT_ORIG_PATT EXT4_SNAPSHOT_SHARE_FL ++#define OST_SNAPSHOT_FILE_PATT \ ++ (EXT4_SNAPSHOT_SHARE_FL | EXT4_SNAPSHOT_SP_FL) ++#define OST_SNAPSHOT_DEL_PATT \ ++ (EXT4_SNAPSHOT_SHARE_FL | EXT4_SNAPSHOT_FL) ++ ++#define EXT4_TEST_OST_SNAPSHOT_FILE(inode) \ ++ ((EXT4_I(inode)->i_flags & SNAPSHOT_FLAGS_NODELMASK) == \ ++ OST_SNAPSHOT_FILE_PATT) ++ ++#define EXT4_TEST_OST_SNAPSHOT_ORIG(inode) \ ++ ((EXT4_I(inode)->i_flags & SNAPSHOT_FLAGS_NODELMASK) == \ ++ OST_SNAPSHOT_ORIG_PATT) ++ ++/* test if OST_SNAPSHOT_FILE or OST_SNAPSHOT_ORIG */ ++#define EXT4_TEST_OST_SNAPSHOT(inode) \ ++ ((EXT4_I(inode)->i_flags & EXT4_SNAPSHOT_SHARE_FL) != 0) ++ ++#define EXT4_TEST_OST_SNAPSHOT_DEL(inode) \ ++ ((EXT4_I(inode)->i_flags & OST_SNAPSHOT_DEL_PATT) == \ ++ OST_SNAPSHOT_DEL_PATT) ++ ++#define EXT4_SET_OST_SNAPSHOT_ORIG(inode) \ ++do { \ ++ ext4_clear_inode_flag(inode, EXT4_INODE_SNAPSHOT); \ ++ ext4_clear_inode_flag(inode, EXT4_INODE_SNAPSHOT_SP); \ ++ ext4_set_inode_flag(inode, EXT4_INODE_SNAPSHOT_SHARE); \ ++ ext4_set_inode_flags(inode); \ ++} while (0) ++ ++#define EXT4_SET_OST_SNAPSHOT_FILE(inode) \ ++do { \ ++ ext4_clear_inode_flag(inode, EXT4_INODE_SNAPSHOT); \ ++ ext4_set_inode_flag(inode, EXT4_INODE_SNAPSHOT_SP); \ ++ ext4_set_inode_flag(inode, EXT4_INODE_SNAPSHOT_SHARE); \ ++ ext4_set_inode_flags(inode); \ ++} while (0) ++ ++#define EXT4_CLEAR_OST_SNAPSHOT_FLAGS(inode) \ ++do { \ ++ ext4_clear_inode_flag(inode, EXT4_INODE_SNAPSHOT); \ ++ ext4_clear_inode_flag(inode, EXT4_INODE_SNAPSHOT_SP); \ ++ ext4_clear_inode_flag(inode, EXT4_INODE_SNAPSHOT_SHARE); \ ++ ext4_set_inode_flags(inode); \ ++} while (0) ++ ++#define EXT4_SNAPSHOT_SET_SPARSE_EXTENT(ext) \ ++do { \ ++ 
ext4_ext_store_pblock(ext, 0); \ ++ ext4_ext_mark_uninitialized(ext); \ ++} while (0) ++ ++#define EXT4_SNAPSHOT_TEST_SPARSE_EXTENT(ext) \ ++ ((ext4_ext_pblock(ext) == 0) \ ++ && ext4_ext_is_uninitialized(ext)) ++ ++#define SNAPSHOT_LINK_SIZE (sizeof(struct ext4_snapshot_link)) ++ ++/* snapshot error code in ext4. ++ * other snapshot error codes are defined in ++ * lustre/include/lustre_snapshot.h */ ++#define SNAPSHOT_ERR_NOMEM 2450 ++#define SNAPSHOT_ERR_NOSPC 2451 ++#define SNAPSHOT_ERR_SYSERR 2452 ++ ++#define SNAPSHOT_ERR_MSG1 "Snapshot cannot allocate memory.\n" ++#define SNAPSHOT_ERR_MSG2 "Snapshot no disk space left.\n" ++#define SNAPSHOT_ERR_MSG3 "Snapshot system error. " \ ++ "func=%s route=%d code=%d\n" ++ ++ ++#define SNAPSHOT_CONSOLE_ERR(err) ++ ++enum { ++ SNAPSHOT_CREATE_OP, ++ SNAPSHOT_DELETE_OP, ++ SNAPSHOT_DELETE_NEW_OP, ++ SNAPSHOT_CLEAR_LINK_OP, ++ SNAPSHOT_OP_MAX ++}; ++ ++/* snapshot lock wait timeout [sec] */ ++#define SNAPSHOT_LOCK_TIMEOUT 200 ++/* snapshot lock wait frequency count (1000 / delay[ms] : delay=10ms) */ ++#define SNAPSHOT_LOCK_FREQ 100 ++ ++/* define snapshot unlink state */ ++#define SNAPSHOT_DELETE_BLOCK 0 ++#define SNAPSHOT_DELETE_FILE 1 ++#define SNAPSHOT_DELETE_ERROR 2 ++ ++#define SNAPSHOT_MATCH_TS(t1, t2) \ ++ (((t1)->tv_sec == (t2)->tv_sec) && \ ++ ((t1)->tv_nsec == (t2)->tv_nsec)) ++#define SNAPSHOT_CLTIME(t) \ ++ ((((__u64)(t)->tv_sec) << 30) + (t)->tv_nsec) ++ ++ ++#ifndef _EXT4_DEBUG_H_ ++#define _EXT4_DEBUG_H_ ++ ++#ifdef EXT4_DEBUG ++#define D_TRACE 0x00000001 /* ENTRY/EXIT markers */ ++ ++#define CDEBUG(mask, format, a...) 
\ ++ printk("<5>Lustre: %d:%d:(%s:%d:%s()) " format, \ ++ 0, 0, __FILE__, __LINE__, __FUNCTION__, ## a); ++ ++#define GOTO(label) \ ++do { \ ++ CDEBUG(D_TRACE, "Process going to %s\n", #label); \ ++ goto label; \ ++} while (0) ++ ++#define GOTO_ERROR(label, err) \ ++do { \ ++ long GOTO__err = (long)(err); \ ++ CDEBUG(D_TRACE, "Process leaving via %s (err=%lu : %ld : %lx)\n",\ ++ #label, (unsigned long)GOTO__err, (signed long)GOTO__err,\ ++ (signed long)GOTO__err); \ ++ goto label; \ ++} while (0) ++ ++#define RETURN(rc) \ ++do { \ ++ typeof(rc) RETURN__ret = (rc); \ ++ CDEBUG(D_TRACE, "Process leaving (rc=%lu : %ld : %lx)\n", \ ++ (long)RETURN__ret, (long)RETURN__ret, (long)RETURN__ret);\ ++ return RETURN__ret; \ ++} while (0) ++ ++#define RETURN_ERROR(rc, err) \ ++do { \ ++ typeof(rc) RETURN__ret = (rc); \ ++ typeof(err) RETURN__err = (err); \ ++ CDEBUG(D_TRACE, "Process leaving (rc=%lu : %ld : %lx) (err=%lu : %ld : %lx)\n", \ ++ (long)RETURN__ret, (long)RETURN__ret, (long)RETURN__ret, \ ++ (long)RETURN__err, (long)RETURN__err, (long)RETURN__err); \ ++ return RETURN__ret; \ ++} while (0) ++ ++#define ENTRY CDEBUG(D_TRACE, "Process entered\n"); ++ ++#define EXIT \ ++do { \ ++ CDEBUG(D_TRACE, "Process leaving\n"); \ ++ EXIT_NESTING; \ ++} while (0) ++ ++#define LOG(fmt, ...) printk("<5>EXT4 %s %d: " fmt, __FUNCTION__, __LINE__, ##__VA_ARGS__); ++ ++#else /* !EXT4_DEBUG */ ++#include ++#define LOG(fmt, ...) 
++#endif ++ ++#endif ++ ++ ++/* snapshot generation linked list */ ++struct ext4_snapshot_link { ++ unsigned long new_ino; /* newer snapshot inode no */ ++ unsigned long old_ino; /* older snapshot inode no */ ++ unsigned long org_ino; /* original file inode no */ ++ struct timespec org_ts; /* original file timestamp */ ++}; ++ ++/* snapshot generation lock list */ ++struct ext4_snapshot_gen_lock { ++ unsigned long org; /* original file inode no */ ++ struct timespec ts; /* original file timestamp */ ++ struct list_head list; /* list_head structure */ ++}; ++ ++/* moved from lustre/osd-ext4/osd_io.c */ ++struct bpointers { ++ unsigned long *blocks; ++ unsigned long start; ++ int num; ++ int init_num; ++ int create; ++}; ++ ++ ++extern int ext4_get_snapshot_lock_timeout(void); ++extern void ext4_set_snapshot_lock_timeout(int); ++extern int ext4_snapshot_lock(struct inode *, bool, ++ struct ext4_snapshot_gen_lock **); ++extern void ext4_snapshot_unlock(struct ext4_snapshot_gen_lock *); ++extern struct inode *ext4_snapshot_read_link(struct inode *, ++ struct ext4_snapshot_link *, ++ struct ext4_snapshot_link *, ++ bool, int*); ++extern int ext4_snapshot_copy_blocks(handle_t *, struct inode *, ++ struct inode *, ++ ext4_lblk_t, ++ ext4_lblk_t, bool); ++extern int ext4_snapshot_truncate_blocks(handle_t *, struct inode *, ++ struct inode *, ++ ext4_lblk_t, ++ ext4_lblk_t); ++extern int ext4_snapshot_punch(handle_t *, struct inode *, ++ __u64, __u64); ++extern void ext4_snapshot_init(void); ++extern void ext4_snapshot_exit(void); ++ ++static inline int ext4_calc_snapshot_link_credits(struct inode *inode, ++ int op) ++{ ++ int credits = 0; ++ struct super_block *sb = inode->i_sb; ++ /* calc journal credits ++ * below section is calculating credits for ext4_xattr_set() */ ++ credits = EXT4_DATA_TRANS_BLOCKS(sb); ++ if ((SNAPSHOT_LINK_SIZE >= EXT4_XATTR_MIN_LARGE_EA_SIZE(sb->s_blocksize)) && ++ EXT4_HAS_INCOMPAT_FEATURE(sb, ++ EXT4_FEATURE_INCOMPAT_EA_INODE)) { ++ int 
nrblocks = (SNAPSHOT_LINK_SIZE + sb->s_blocksize - 1) >> ++ sb->s_blocksize_bits; ++ /* For new inode */ ++ credits += EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + 3; ++ /* For data blocks of EA inode */ ++ credits += ext4_meta_trans_blocks(inode, nrblocks, 0); ++ } ++ switch (op) { ++ case SNAPSHOT_CREATE_OP: ++ /* one xattr update is budgeted for each of: ++ * 1 : for create original link ++ * 1 : for create new snapshot link ++ * 1 : for create old snapshot link ++ * 1 : for delete original link (rollback) ++ * 1 : for delete new snapshot link (rollback) ++ * 1 : for delete old snapshot link (rollback) */ ++ credits *= 6; ++ return credits; ++ ++ case SNAPSHOT_DELETE_OP: ++ /* one xattr update is budgeted for each of: ++ * 1 : for update target snapshot link ++ * 1 : for update newer snapshot link ++ * 1 : for update older snapshot link ++ * 1 : for update older snapshot link (rollback) */ ++ credits *= 4; ++ return credits; ++ ++ case SNAPSHOT_DELETE_NEW_OP: ++ /* one xattr update is budgeted for each of: ++ * 1 : for update target snapshot link ++ * 2 : for update newer snapshot link (and its rollback) */ ++ credits *= 3; ++ return credits; ++ case SNAPSHOT_CLEAR_LINK_OP: ++ /* one xattr update is budgeted for: ++ * 1 : for update target snapshot link */ ++ return credits; ++ default: ++ CERROR("invalid opc=%d\n", op); ++ return 0; ++ } ++} ++ ++static inline int ext4_snapshot_set_link(handle_t *handle, ++ struct inode *inode, ++ struct ext4_snapshot_link *link) ++{ ++ int err = 0; ++ err = ext4_xattr_set_handle(handle, inode, ++ EXT4_XATTR_INDEX_TRUSTED, ++ EXT4_XATTR_NAME_SNAPSHOT_LINK, ++ (void *)link, SNAPSHOT_LINK_SIZE, 0); ++ if (err) ++ CERROR("fail to set snapshot link. 
" ++ "inode=%lu err=%d\n", inode->i_ino, err); ++ return err; ++} ++ ++static inline int ext4_snapshot_get_link(struct inode *inode, ++ struct ext4_snapshot_link *link) ++{ ++ int err = 0; ++ if (link == NULL) ++ BUG(); ++ ++ err = ext4_xattr_get(inode, ++ EXT4_XATTR_INDEX_TRUSTED, ++ EXT4_XATTR_NAME_SNAPSHOT_LINK, ++ (void *)link, SNAPSHOT_LINK_SIZE); ++ if (err != SNAPSHOT_LINK_SIZE) { ++ if (err >= 0) /* xattr read, but not SNAPSHOT_LINK_SIZE bytes */ ++ err = -ENOLINK; ++ if (err < 0) ++ CDEBUG(D_ERROR, "fail to get snapshot link. " ++ "inode=%lu err=%d\n", inode->i_ino, err); ++ } else { ++ err = 0; ++ } ++ return err; ++} ++ ++static inline int ext4_snapshot_del_link(handle_t *handle, ++ struct inode *inode) ++{ ++ int err = 0; ++ err = ext4_xattr_set_handle(handle, inode, ++ EXT4_XATTR_INDEX_TRUSTED, ++ EXT4_XATTR_NAME_SNAPSHOT_LINK, ++ NULL, 0, 0); ++ if (err) ++ CERROR("fail to delete snapshot link. " ++ "inode=%lu err=%d\n", inode->i_ino, err); ++ return err; ++} ++#endif /* _EXT4_SNAPSHOT_H */ +diff -urN -x .svn linux-stage.org/fs/ext4/snapshot_debug.h linux-stage/fs/ext4/snapshot_debug.h +--- linux-stage.org/fs/ext4/snapshot_debug.h 1970-01-01 09:00:00.000000000 +0900 ++++ linux-stage/fs/ext4/snapshot_debug.h 2018-09-03 14:15:30.000000000 +0900 +@@ -0,0 +1,272 @@ ++/* ++ * GPL HEADER START ++ * ++ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 only, ++ * as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License version 2 for more details. A copy is ++ * included in the COPYING file that accompanied this code. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ * ++ * GPL HEADER END ++ */ ++/* ++ * Copyright(c) 2016-2018 FUJITSU LIMITED. ++ * All rights reserved. ++ */ ++ ++#ifndef _SNAPSHOT_DEBUG_H ++#define _SNAPSHOT_DEBUG_H ++ ++#ifdef DEBUG_SNAPSHOT ++/* ++ * ext4_show_snapshot_link() ++ * ++ * show snapshot link list to syslog ++ * ++ * \param[in] inode target inode ++ * \param[in] msg title message ++ * ++ * \retval none ++ */ ++void ext4_show_snapshot_link(struct inode *inode, const char *msg) ++{ ++ struct inode *next_inode; ++ int no, err; ++ struct ext4_snapshot_link link; ++ ++ if (!EXT4_TEST_OST_SNAPSHOT(inode)) ++ return; ++ ++ printk(KERN_ERR "----- snapshot link summary [%-10s] " ++ "------------------\n", msg); ++ /* find newest snapshot linked inode */ ++ next_inode = inode; ++ err = ext4_xattr_get(inode, EXT4_XATTR_INDEX_TRUSTED, ++ EXT4_XATTR_NAME_SNAPSHOT_LINK, ++ (void *)&link, sizeof(link)); ++ if (err < 0) { ++ printk(KERN_ERR "SNAPSHOT: ERROR: Cannot find xattr " ++ "ino=%ld\n", inode->i_ino); ++ return; ++ } ++ while (link.new_ino) { ++ next_inode = ext4_iget(inode->i_sb, link.new_ino); ++ if (IS_ERR(next_inode)) { ++ printk(KERN_ERR "SNAPSHOT: ERROR: Cannot find inode " ++ "ino=%ld\n", link.new_ino); ++ return; ++ } ++ err = ext4_xattr_get(next_inode, ++ EXT4_XATTR_INDEX_TRUSTED, ++ EXT4_XATTR_NAME_SNAPSHOT_LINK, ++ (void *)&link, sizeof(link)); ++ if (err < 0) { ++ printk(KERN_ERR "SNAPSHOT: ERROR: Cannot find xattr " ++ "ino=%ld\n", next_inode->i_ino); ++ return; ++ } ++ iput(next_inode); ++ } ++ /* show snapshot link info foreach */ ++ no = 0; ++ printk(KERN_ERR "SNAPSHOT: SNAP%02d ino=%lu flag=%08X new=%lu " ++ "old=%lu\n", no, next_inode->i_ino, ++ next_inode->i_flags, link.new_ino, link.old_ino); ++ ++ while (link.old_ino) { ++ bool me = false; ++ if (no > 10) { ++ 
printk(KERN_ERR "SNAPSHOT: ERROR: Too many inodes " ++ "of snapshot link\n"); ++ break; ++ } ++ if (link.old_ino != inode->i_ino) { ++ next_inode = ext4_iget(inode->i_sb, link.old_ino); ++ if (IS_ERR(next_inode)) { ++ printk(KERN_ERR "SNAPSHOT: ERROR: Cannot find inode " ++ " ino=%ld\n", link.old_ino); ++ return; ++ } ++ } else { ++ next_inode = inode; ++ me = true; ++ } ++ err = ext4_xattr_get(next_inode, ++ EXT4_XATTR_INDEX_TRUSTED, ++ EXT4_XATTR_NAME_SNAPSHOT_LINK, ++ (void *)&link, sizeof(link)); ++ if (err < 0) { ++ printk(KERN_ERR "SNAPSHOT: ERROR: Cannot find xattr " ++ "ino=%ld\n", next_inode->i_ino); ++ return; ++ } ++ no++; ++ printk(KERN_ERR "SNAPSHOT:%sSNAP%02d ino=%lu flag=%08X " ++ "new=%lu old=%lu\n", (me ? "*" : " "), ++ no, next_inode->i_ino, ++ next_inode->i_flags, link.new_ino, ++ link.old_ino); ++ if (!me) ++ iput(next_inode); ++ } ++ printk(KERN_ERR "--------------------------------------------------" ++ "---------\n"); ++ return; ++} ++EXPORT_SYMBOL(ext4_show_snapshot_link); ++ ++/* ++ * ext4_show_inode_blocks() ++ * ++ * show specified inode extent blocks summery to syslog ++ * ++ * \param[in] inode target inode ++ * ++ * \retval none ++ */ ++static void ext4_show_inode_blocks(struct inode *inode) ++{ ++ struct ext4_extent *ex = NULL; ++ struct ext4_ext_path *p, *path = NULL; ++ ext4_fsblk_t pblock; ++ ext4_lblk_t block, ee_block, max; ++ unsigned short ee_len; ++ int line; ++ ++ /* max : logical block no (to) */ ++ max = (inode->i_size + EXT4_BLOCK_SIZE(inode->i_sb) - 1) ++ >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); ++ printk(KERN_ERR "SNAPSHOT: ino=%lu i_blocks=%lu i_size=%llu " ++ "max=%u\n", inode->i_ino, inode->i_blocks, ++ inode->i_size, max); ++ ++ /* repeat until block num */ ++ for (block = line = 0; ((block < max) && (line < 30)); line++) { ++ /* get extent path of compare inode */ ++ p = ext4_ext_find_extent(inode, block, path); ++ if (IS_ERR(p)) { ++ /* not exist extent path */ ++ printk(KERN_ERR "SNAPSHOT: fail to get extent: " ++ 
"lblk=%u, err=%ld", block, PTR_ERR(p)); ++ break; ++ } ++ path = p; ++ ++ /* get physical block no of compare inode */ ++ ex = path[ext_depth(inode)].p_ext; ++ if (!ex) { ++ printk(KERN_ERR "SNAPSHOT: ino=%lu No extent\n", ++ inode->i_ino); ++ break; ++ } ++ ++ /* get block num of compare extent */ ++ ee_block = le32_to_cpu(ex->ee_block); ++ ee_len = ext4_ext_get_actual_len(ex); ++ if (block >= ee_block + ee_len) { ++ block++; ++ continue; ++ } ++ ++ block = ee_block + ee_len; ++ pblock = ext4_ext_pblock(ex); ++ printk(KERN_ERR "SNAPSHOT: ino=%lu " ++ "block[%u - %u] pblock=[%Lu - %Lu]\n", ++ inode->i_ino, ee_block, block - 1, ++ pblock, pblock + ee_len - 1); ++ } ++ if (path) { ++ ext4_ext_drop_refs(path); ++ kfree(path); ++ } ++ ++ if ((block < max) && ex) ++ printk(KERN_ERR "SNAPSHOT: ino=%lu ... since too many " ++ "blocks, interrupted.\n", inode->i_ino); ++ return; ++} ++ ++/* ++ * ext4_show_snapshot_blocks() ++ * ++ * show specified inode & neary snapshot inode extent blocks summery to syslog ++ * ++ * \param[in] inode target inode ++ * \param[in] msg title message ++ * \param[in] flag show snapshot blocks flag 0=disable 1=enable ++ * ++ * \retval none ++ */ ++void ext4_show_snapshot_blocks(struct inode *inode, const char *msg, ++ int flag) ++{ ++ struct inode *next_inode; ++ struct ext4_snapshot_link link; ++ int err; ++ ++ /* check if target inode is snapshot file */ ++ if (!EXT4_TEST_OST_SNAPSHOT(inode)) ++ return; ++ ++ printk(KERN_ERR "===== inode blocks summary [%-10s] " ++ "===================\n", msg); ++ /* show specified inode blocks */ ++ ext4_show_inode_blocks(inode); ++ ++ /* show neary snapshot blocks too ? 
*/ ++ if (!flag) ++ goto out; ++ ++ err = ext4_xattr_get(inode, EXT4_XATTR_INDEX_TRUSTED, ++ EXT4_XATTR_NAME_SNAPSHOT_LINK, ++ (void *)&link, sizeof(link)); ++ if (err < 0) ++ goto out; ++ ++ /* show older snapshot inode blocks, if exist */ ++ if (link.old_ino) { ++ /* get compare inode */ ++ next_inode = ext4_iget(inode->i_sb, link.old_ino); ++ if (IS_ERR(next_inode)) { ++ /* not exist compare inode */ ++ goto skip; ++ } ++ printk(KERN_ERR "----- old snapshot ----------------------" ++ "-------------\n"); ++ ext4_show_inode_blocks(next_inode); ++ iput(next_inode); ++ } ++ ++skip: ++ /* show newer snapshot inode blocks, if exist */ ++ if (link.new_ino) { ++ /* get compare inode */ ++ next_inode = ext4_iget(inode->i_sb, link.new_ino); ++ if (IS_ERR(next_inode)) { ++ /* not exist compare inode */ ++ goto out; ++ } ++ printk(KERN_ERR "----- new snapshot ----------------------" ++ "-------------\n"); ++ ext4_show_inode_blocks(next_inode); ++ iput(next_inode); ++ } ++ ++out: ++ printk(KERN_ERR "=================================================" ++ "==========\n"); ++ return; ++} ++EXPORT_SYMBOL(ext4_show_snapshot_blocks); ++#endif ++#endif +diff -urN -x .svn linux-stage.org/fs/ext4/super.c linux-stage/fs/ext4/super.c +--- linux-stage.org/fs/ext4/super.c 2018-08-31 20:53:57.000000000 +0900 ++++ linux-stage/fs/ext4/super.c 2018-10-24 14:05:04.000000000 +0900 +@@ -48,9 +48,9 @@ + + #include "ext4.h" + #include "ext4_jbd2.h" +-#include "xattr.h" + #include "acl.h" + #include "mballoc.h" ++#include "snapshot.h" + + #define CREATE_TRACE_POINTS + #include +@@ -2268,12 +2268,27 @@ + list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); + vfs_dq_init(inode); + if (inode->i_nlink) { ++ int err; ++ struct ext4_snapshot_gen_lock *lock = NULL; ++ + ext4_msg(sb, KERN_DEBUG, + "%s: truncating inode %lu to %lld bytes", + __func__, inode->i_ino, inode->i_size); + jbd_debug(2, "truncating inode %lu to %lld bytes\n", + inode->i_ino, inode->i_size); +- ext4_truncate(inode); ++ ++ err 
= ext4_snapshot_orphan_truncate(inode, &lock); ++ if (err) { ++ CERROR("fail to truncate orphan inode " ++ "OST=%s ost_inode=%lu err=%d\n", ++ EXT4_SB(inode->i_sb)->s_es->s_volume_name, ++ inode->i_ino, err); ++ SNAPSHOT_CONSOLE_ERR(err); ++ } else ++ ext4_truncate(inode); ++ ++ if (lock) ++ ext4_snapshot_unlock(lock); + nr_truncates++; + } else { + ext4_msg(sb, KERN_DEBUG, +@@ -5173,6 +5188,10 @@ + err = init_inodecache(); + if (err) + goto out1; ++ ++ /* initialize snapshot function */ ++ ext4_snapshot_init(); ++ + err = register_filesystem(&ext4_fs_type); + if (err) + goto out; +@@ -5197,6 +5216,7 @@ + { + ext4_destroy_lazyinit_thread(); + unregister_filesystem(&ext4_fs_type); ++ ext4_snapshot_exit(); + destroy_inodecache(); + exit_ext4_xattr(); + exit_ext4_mballoc(); diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.5.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.5.series index fbbac67..9bbf181 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.5.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.5.series @@ -48,3 +48,4 @@ rhel6.3/ext4-max-dir-size.patch rhel6.4/ext4-max-dir-size-options.patch rhel6.3/ext4-not-discard-preallocation-umount.patch rhel6.3/ext4-journal-path-opt.patch +rhel6.5/dl_snapshot.patch diff --git a/lustre/doc/lctl.8 b/lustre/doc/lctl.8 index dc71f4a..801e963 100644 --- a/lustre/doc/lctl.8 +++ b/lustre/doc/lctl.8 @@ -381,6 +381,17 @@ Stop LFSCK on all devices. .TP -h, --help Show this help. +.br +.PP +.SS Snapshot Operations +.TP +.B snapshot +The command controls the snapshot feature. The on argument enables the snapshot feature. The status argument shows whether the snapshot feature is enabled or not. +Root privileges are needed to execute this command. This command must be run on the MDS node which manages the MDT0 device. +.TP +.B snapshot_get_orphan < --list | --fid | --delete [-f] > +The command controls orphan object files at OSTs. 
The --list option shows orphan object files at the specified OST. The --fid option shows object files which refer to the specified object file. The --delete option deletes the specified object file. If the -f option is specified, do not prompt before deleting. +Root privileges are needed to execute this command. This command must be run on the OSS node. .SS Debug .TP .BI debug_daemon diff --git a/lustre/doc/lfs.1 b/lustre/doc/lfs.1 index 1d5d9fb..9880fa2 100644 --- a/lustre/doc/lfs.1 +++ b/lustre/doc/lfs.1 @@ -84,6 +84,12 @@ lfs \- Lustre utility to create a file with specific striping pattern, find the .br .B lfs data_version [-n] \fB\fR .br +.B lfs snapshot --create [-s <snapname>] [-d <directory>] +.br +.B lfs snapshot --delete -s <snapname> [-d <directory>] [-f] [-I] +.br +.B lfs snapshot --list [-d <directory>] [-R] +.br .B lfs help .SH DESCRIPTION .B lfs @@ -284,6 +290,31 @@ MDT0000. This is restricted to avoid creating directory trees that have intermediate path components on a series different MDTs and become unavailable if any of the intermediate MDTs are offline. .TP +.B snapshot --create [-s <snapname>] [-d <directory>] +Create a snapshot of the +.IR directory +with the name +.IR snapname . +.TP +.B snapshot --delete -s <snapname> [-d <directory>] [-f] [-I] +Delete the snapshot named +.IR snapname +from the +.IR directory . +If the +.B -f +option is specified, do not prompt before deleting. +If the +.B -I +option is specified, ignore the restriction by CoW size. +.TP +.B snapshot --list [-d <directory>] [-R] +List snapshot information for the +.IR directory . +If the +.B -R +option is specified, also list the snapshots of its subdirectories recursively. 
+.TP .B help Provides brief help on the various arguments .TP diff --git a/lustre/include/Makefile.am b/lustre/include/Makefile.am index 093b587..302a0c9 100644 --- a/lustre/include/Makefile.am +++ b/lustre/include/Makefile.am @@ -90,4 +90,5 @@ EXTRA_DIST = \ obd_class.h \ obd.h \ obd_support.h \ - obd_target.h + obd_target.h \ + lustre_snapshot.h diff --git a/lustre/include/dt_object.h b/lustre/include/dt_object.h index a39c461..676217c 100644 --- a/lustre/include/dt_object.h +++ b/lustre/include/dt_object.h @@ -173,6 +173,16 @@ struct dt_device_operations { struct dt_device *dev, int mode, unsigned long timeout, __u32 alg, struct lustre_capa_key *keys); + + /** + * Query or enable the snapshot feature, and list orphan objects. + */ + int (*dt_snapshot_get_enable)(const struct lu_env *env, + struct dt_device *dev); + int (*dt_snapshot_set_enable)(const struct lu_env *env, + struct dt_device *dev); + int (*dt_snapshot_list_orphan)(struct dt_device *dev, + void *buf); }; struct dt_index_features { @@ -476,6 +486,37 @@ struct dt_object_operations { int (*do_object_unlock)(const struct lu_env *env, struct dt_object *dt, struct ldlm_enqueue_info *einfo, union ldlm_policy_data *policy); + + /** + * snapshot + */ + int (*do_lod_snapshot_clone)(const struct lu_env *env, + struct dt_object *snap_dt, + struct lu_attr *attr, + struct lu_buf *buff); + int (*do_osp_snapshot_clone)(const struct lu_env *env, + struct dt_object *snap_dt, + struct lu_attr *attr, + const struct lu_fid *orig_fid, + const struct lu_fid *mdt_fid); + int (*do_osd_declare_snapshot_clone)(const struct lu_env *env, + struct dt_object *snap_dt, + struct dt_object *orig_dt, + struct thandle *thandle, + int ignore_flag); + int (*do_osd_snapshot_clone)(struct dt_object *snap_dt, + struct dt_object *orig_dt); + int (*do_osd_snapshot_get_old_list)(struct dt_object *snap_dt, + void *list_buf); + int (*do_osd_snapshot_lock)(struct dt_object *dt, bool create, + void **lock); + void (*do_osd_snapshot_unlock)(void *lock); + __u32
(*do_osd_snapshot_get_info)(struct dt_object *dt); + int (*do_osd_snapshot_set_del_flag)(struct dt_object *dt); + int (*do_osd_snapshot_get_orphan)(struct dt_object *dt, + void *fid_buf, + int *array_num); + int (*do_osd_snapshot_destroy)(struct dt_object *dt, void *orig_fid); }; /** @@ -1539,6 +1580,121 @@ static inline int dt_lookup(const struct lu_env *env, return ret; } +static inline int dt_snapshot_get_enable(const struct lu_env *env, + struct dt_device *dt) +{ + LASSERT(dt); + LASSERT(dt->dd_ops); + LASSERT(dt->dd_ops->dt_snapshot_get_enable); + return dt->dd_ops->dt_snapshot_get_enable(env, dt); +} + +static inline int dt_snapshot_set_enable(const struct lu_env *env, + struct dt_device *dt) +{ + LASSERT(dt); + LASSERT(dt->dd_ops); + LASSERT(dt->dd_ops->dt_snapshot_set_enable); + return dt->dd_ops->dt_snapshot_set_enable(env, dt); +} + +static inline int dt_snapshot_list_orphan(struct dt_device *dt, + void *buf) +{ + LASSERT(dt); + LASSERT(dt->dd_ops); + LASSERT(dt->dd_ops->dt_snapshot_list_orphan); + return dt->dd_ops->dt_snapshot_list_orphan(dt, buf); +} + +static inline int dt_snapshot_lock(struct dt_object *dt, + bool create, void **lock) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_osd_snapshot_lock); + return dt->do_ops->do_osd_snapshot_lock(dt, create, lock); +} + +static inline void dt_snapshot_unlock(struct dt_object *dt, void *lock) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_osd_snapshot_unlock); + dt->do_ops->do_osd_snapshot_unlock(lock); +} + +static inline int dt_snapshot_get_orphan(struct dt_object *dt, + void *fid_buf, + int *array_num) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_osd_snapshot_get_orphan); + return dt->do_ops->do_osd_snapshot_get_orphan(dt, + fid_buf, + array_num); +} + +static inline int dt_snapshot_destroy(struct dt_object *dt, void *orig_fid) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_osd_snapshot_destroy); + return 
dt->do_ops->do_osd_snapshot_destroy(dt, orig_fid); +} + +static inline int dt_osp_snapshot_clone(const struct lu_env *env, + struct dt_object *snap_dt, + struct lu_attr *attr, + const struct lu_fid *orig_fid, + const struct lu_fid *mdt_fid) +{ + LASSERT(snap_dt); + LASSERT(snap_dt->do_ops); + LASSERT(snap_dt->do_ops->do_osp_snapshot_clone); + return snap_dt->do_ops->do_osp_snapshot_clone(env, snap_dt, + attr, + orig_fid, + mdt_fid); +} + +static inline int dt_osd_declare_snapshot_clone(const struct lu_env *env, + struct dt_object *snap_dt, + struct dt_object *orig_dt, + struct thandle *thandle, + int ignore_flag) +{ + LASSERT(snap_dt); + LASSERT(snap_dt->do_ops); + LASSERT(snap_dt->do_ops->do_osd_declare_snapshot_clone); + return snap_dt->do_ops->do_osd_declare_snapshot_clone(env, + snap_dt, + orig_dt, + thandle, + ignore_flag); +} + +static inline int dt_osd_snapshot_clone(struct dt_object *snap_dt, + struct dt_object *orig_dt) +{ + LASSERT(snap_dt); + LASSERT(snap_dt->do_ops); + LASSERT(snap_dt->do_ops->do_osd_snapshot_clone); + return snap_dt->do_ops->do_osd_snapshot_clone(snap_dt, + orig_dt); +} + +static inline int dt_osd_snapshot_get_old_list(struct dt_object *snap_dt, + void *list_buf) +{ + LASSERT(snap_dt); + LASSERT(snap_dt->do_ops); + LASSERT(snap_dt->do_ops->do_osd_snapshot_get_old_list); + return snap_dt->do_ops->do_osd_snapshot_get_old_list(snap_dt, list_buf); +} + + #define LU221_BAD_TIME (0x80000000U + 24 * 3600) struct dt_find_hint { diff --git a/lustre/include/linux/lustre_compat25.h b/lustre/include/linux/lustre_compat25.h index 4b6fddf..cfa7e59 100644 --- a/lustre/include/linux/lustre_compat25.h +++ b/lustre/include/linux/lustre_compat25.h @@ -133,6 +133,14 @@ static inline void ll_set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt, # define inode_dio_done(i) up_read(&(i)->i_alloc_sem) #endif +#ifdef HAVE_IOP_ATOMIC_OPEN +#define ll_iop_lookup(parent, dentry) (parent->i_op->lookup(parent, \ + dentry, LOOKUP_PARENT)) +#else +#define 
ll_iop_lookup(parent, dentry) (parent->i_op->lookup(parent, \ + dentry, NULL)) +#endif + #ifndef FS_HAS_FIEMAP #define FS_HAS_FIEMAP (0) #endif diff --git a/lustre/include/lu_object.h b/lustre/include/lu_object.h index d3b234d..68cf69b 100644 --- a/lustre/include/lu_object.h +++ b/lustre/include/lu_object.h @@ -897,7 +897,8 @@ struct lu_rdpg { enum lu_xattr_flags { LU_XATTR_REPLACE = (1 << 0), - LU_XATTR_CREATE = (1 << 1) + LU_XATTR_CREATE = (1 << 1), + LU_XATTR_SNAPSHOT = (1 << 15) }; /** @} helpers */ diff --git a/lustre/include/lu_target.h b/lustre/include/lu_target.h index d01a49e..ac8a772 100644 --- a/lustre/include/lu_target.h +++ b/lustre/include/lu_target.h @@ -121,6 +121,9 @@ struct tgt_session_info { bool tsi_preprocessed; /* request JobID */ char *tsi_jobid; + + /* disable READONLY control in snapshot */ + int tsi_snapshot; }; static inline struct tgt_session_info *tgt_ses_info(const struct lu_env *env) @@ -165,6 +168,60 @@ static inline void tgt_opdata_clear(const struct lu_env *env, __u64 flags) } /* + * tgt_snapshot_set() + * + * set snapshot progress (not readonly for snapshot files) + * + * \param[in] env lu environment + */ +static inline void tgt_snapshot_set(const struct lu_env *env) +{ + struct tgt_session_info *tsi; + + LASSERT(env->le_ses); + tsi = tgt_ses_info(env); + tsi->tsi_snapshot = 1; +} + +/* + * tgt_snapshot_clear() + * + * clear snapshot progress (readonly for snapshot files) + * + * \param[in] env lu environment + */ +static inline void tgt_snapshot_clear(const struct lu_env *env) +{ + struct tgt_session_info *tsi; + + LASSERT(env->le_ses); + tsi = tgt_ses_info(env); + tsi->tsi_snapshot = 0; +} + +/* + * tgt_snapshot() + * + * check snapshot readonly progress + * + * \param[in] env lu environment + * + * \retval 0 not snapshot process (readonly for snapshot files) + * \retval 1 snapshot process (not readonly for snapshot files) + */ +static inline int tgt_snapshot(const struct lu_env *env) +{ + struct tgt_session_info *tsi; + int rc 
= 0; + + LASSERT(env->le_ses); + tsi = tgt_ses_info(env); + rc = tsi->tsi_snapshot; + + return rc; +} + +/* * Generic unified target support. */ enum tgt_handler_flags { diff --git a/lustre/include/lustre/lustre_idl.h b/lustre/include/lustre/lustre_idl.h index aa2de2b..fb8dd66 100644 --- a/lustre/include/lustre/lustre_idl.h +++ b/lustre/include/lustre/lustre_idl.h @@ -143,6 +143,8 @@ #define SEQ_CONTROLLER_PORTAL 32 #define MGS_BULK_PORTAL 33 +#define MDS_SNAPSHOT_PORTAL 49 + /* Portal 63 is reserved for the Cray Inc DVS - nic@cray.com, roe@cray.com, n8851@cray.com */ /* packet types */ @@ -1533,6 +1535,7 @@ typedef enum { OST_QUOTACHECK = 18, OST_QUOTACTL = 19, OST_QUOTA_ADJUST_QUNIT = 20, /* not used since 2.4 */ + OST_SNAPSHOT = 32, OST_LAST_OPC } ost_cmd_t; #define OST_FIRST_OPC OST_REPLY @@ -2184,6 +2187,7 @@ typedef enum { REINT_SETXATTR = 7, REINT_RMENTRY = 8, REINT_MIGRATE = 9, + REINT_SNAPSHOT = 21, REINT_MAX } mds_reint_t, mdt_reint_t; @@ -2274,6 +2278,11 @@ enum md_op_flags { #define LUSTRE_NOATIME_FL 0x00000080 /* do not update atime */ #define LUSTRE_DIRSYNC_FL 0x00010000 /* dirsync behaviour (dir only) */ +/* i_flags for snapshot */ +#define LUSTRE_SNAPSHOT_SHARE_FL 0x01000000 /* data block shared */ +#define LUSTRE_SNAPSHOT_SP_FL 0x04000000 /* snapshot sp flag */ +#define LUSTRE_SNAPSHOT_FL 0x08000000 /* snapshot dir/file */ + #ifdef __KERNEL__ /* Convert wire LUSTRE_*_FL to corresponding client local VFS S_* values * for the client inode i_flags. The LUSTRE_*_FL are the Lustre wire @@ -2289,7 +2298,12 @@ static inline int ll_ext_to_inode_flags(int flags) #if defined(S_DIRSYNC) ((flags & LUSTRE_DIRSYNC_FL) ? S_DIRSYNC : 0) | #endif - ((flags & LUSTRE_IMMUTABLE_FL) ? S_IMMUTABLE : 0)); + ((flags & LUSTRE_IMMUTABLE_FL) ? S_IMMUTABLE : 0) | + ((flags & LUSTRE_SNAPSHOT_SHARE_FL) + ? LUSTRE_SNAPSHOT_SHARE_FL : 0) | + ((flags & LUSTRE_SNAPSHOT_SP_FL) + ? LUSTRE_SNAPSHOT_SP_FL : 0) | + ((flags & LUSTRE_SNAPSHOT_FL) ? 
LUSTRE_SNAPSHOT_FL : 0)); } static inline int ll_inode_to_ext_flags(int iflags) @@ -2300,7 +2314,11 @@ static inline int ll_inode_to_ext_flags(int iflags) #if defined(S_DIRSYNC) ((iflags & S_DIRSYNC) ? LUSTRE_DIRSYNC_FL : 0) | #endif - ((iflags & S_IMMUTABLE) ? LUSTRE_IMMUTABLE_FL : 0)); + ((iflags & S_IMMUTABLE) ? LUSTRE_IMMUTABLE_FL : 0) | + ((iflags & LUSTRE_SNAPSHOT_SHARE_FL) + ? LUSTRE_SNAPSHOT_SHARE_FL : 0) | + ((iflags & LUSTRE_SNAPSHOT_SP_FL) ? LUSTRE_SNAPSHOT_SP_FL : 0) | + ((iflags & LUSTRE_SNAPSHOT_FL) ? LUSTRE_SNAPSHOT_FL : 0)); } #endif @@ -2521,6 +2539,7 @@ enum mds_op_bias { MDS_OWNEROVERRIDE = 1 << 11, MDS_HSM_RELEASE = 1 << 12, MDS_RENAME_MIGRATE = 1 << 13, + MDS_SNAPSHOT = 1 << 30, }; /* instance of mdt_reint_rec */ @@ -2674,6 +2693,30 @@ struct mdt_rec_setxattr { __u32 sx_padding_11; /* rr_padding_4 */ }; +struct mdt_rec_snapshot_create { + __u32 sc_opcode; + __u32 sc_cap; + __u32 sc_fsuid; + __u32 sc_fsuid_h; + __u32 sc_fsgid; + __u32 sc_fsgid_h; + __u32 sc_suppgid1; + __u32 sc_suppgid1_h; + __u32 sc_suppgid2; + __u32 sc_suppgid2_h; + struct lu_fid sc_snapdir_fid; + struct lu_fid sc_snapshot_fid; + obd_time sc_mtime; + obd_time sc_atime; + obd_time sc_ctime; + __u64 sc_file_owner; + struct lu_fid sc_orig_fid; + __u32 sc_padding_6; + __u32 sc_mode; + __u32 sc_umask; + __u32 sc_flags; +}; + /* * mdt_rec_reint is the template for all mdt_reint_xxx structures. 
* Do NOT change the size of various members, otherwise the value @@ -3549,6 +3592,9 @@ struct obdo { #define o_dropped o_misc #define o_cksum o_nlink #define o_grant_used o_data_version +#define o_snapshot_orig_seq o_data_version +#define o_snapshot_orig_oid o_uid_h +#define o_snapshot_orig_ver o_gid_h struct lfsck_request { __u32 lr_event; diff --git a/lustre/include/lustre/lustre_user.h b/lustre/include/lustre/lustre_user.h index 8b612a3..a79d29e 100644 --- a/lustre/include/lustre/lustre_user.h +++ b/lustre/include/lustre/lustre_user.h @@ -206,6 +206,40 @@ struct ost_id { #define DOSTID LPX64":"LPU64 #define POSTID(oi) ostid_seq(oi), ostid_id(oi) +/* snapshot request data for create */ +typedef struct { + int src_fd; + __u32 name_len; + char name[256]; + __u32 hidden_f:2; + __u32 mode; + uid_t uid; + gid_t gid; + struct timespec atim; + struct timespec mtim; +} snapshot_create_t; + +/* snapshot request data */ +struct snapshot_data { + __u32 subcmd; + union { + snapshot_create_t create; + } req; +}; + +/* snapshot request data */ +struct snapshot_list_data { + struct lu_fid fid; + unsigned long ost_ino; +}; + +#define IOC_SNAPSHOT_LIST_MAX 256 + +struct snapshot_list_buf { + struct snapshot_list_data list_data[IOC_SNAPSHOT_LIST_MAX]; + int list_num; +}; + /* * The ioctl naming rules: * LL_* - works on the currently opened filehandle instead of parent dir @@ -274,9 +308,18 @@ struct ost_id { #define IOC_MDC_GETFILESTRIPE _IOWR(IOC_MDC_TYPE, 21, struct lov_user_md *) #define IOC_MDC_GETFILEINFO _IOWR(IOC_MDC_TYPE, 22, struct lov_user_mds_data *) #define LL_IOC_MDC_GETINFO _IOWR(IOC_MDC_TYPE, 23, struct lov_user_mds_data *) +#define LL_IOC_SNAPSHOT _IOWR('F', 33, struct snapshot_data) #define MAX_OBD_NAME 128 /* If this changes, a NEW ioctl must be added */ +enum { + LL_SNAPSHOT_CHKENABLED = 1, + LL_SNAPSHOT_STAT, + LL_SNAPSHOT_CREATE, + LL_SNAPSHOT_CREATE_POST, + LL_SNAPSHOT_UNLINK, +}; + /* Define O_LOV_DELAY_CREATE to be a mask that is not useful for regular * 
files, but are unlikely to be used in practice and are not harmful if * used incorrectly. O_NOCTTY and FASYNC are only meaningful for character diff --git a/lustre/include/lustre_ioctl.h b/lustre/include/lustre_ioctl.h index dc48ad0..be3278a 100644 --- a/lustre/include/lustre_ioctl.h +++ b/lustre/include/lustre_ioctl.h @@ -402,4 +402,21 @@ obd_ioctl_unpack(struct obd_ioctl_data *data, char *pbuf, int max_len) #define IOC_OSC_SET_ACTIVE _IOWR('h', 21, void *) +#define OBD_IOC_SNAPSHOT _IOR('F', 31, OBD_IOC_DATA_TYPE) + +enum obd_ioc_snapshot_subcmd { + OBD_IOC_SNAPSHOT_ON, + OBD_IOC_SNAPSHOT_STATUS, + OBD_IOC_SNAPSHOT_LOCK, + OBD_IOC_SNAPSHOT_UNLOCK, +}; + +#define OBD_IOC_SNAPSHOT_ORPHAN _IOR('F', 30, OBD_IOC_DATA_TYPE) + +enum obd_ioc_snapshot_get_orphan_subcmd { + OBD_IOC_SNAPSHOT_ORPHAN_INODE, + OBD_IOC_SNAPSHOT_LIST_ORPHAN, + OBD_IOC_SNAPSHOT_ORPHAN_DEL, +}; + #endif /* LUSTRE_IOCTL_H_ */ diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h index e10d40e..6c2a2d2 100644 --- a/lustre/include/lustre_net.h +++ b/lustre/include/lustre_net.h @@ -310,6 +310,12 @@ #define MDS_SETA_NTHRS_MAX MDS_MAX_OTHR_THREADS #define MDS_SETA_NTHRS_BASE min(48, MDS_SETA_NTHRS_MAX) +/* snapshot service */ +#define MDS_SNAPSHOT_THR_FACTOR 4 +#define MDS_SNAPSHOT_NTHRS_INIT 2 +#define MDS_SNAPSHOT_NTHRS_MAX 8 +#define MDS_SNAPSHOT_NTHRS_BASE min(16, MDS_SNAPSHOT_NTHRS_MAX) + /* non-affinity threads */ #define MDS_OTHR_NTHRS_INIT PTLRPC_NTHRS_INIT #define MDS_OTHR_NTHRS_MAX MDS_MAX_OTHR_THREADS diff --git a/lustre/include/lustre_req_layout.h b/lustre/include/lustre_req_layout.h index fb57f19..1966171 100644 --- a/lustre/include/lustre_req_layout.h +++ b/lustre/include/lustre_req_layout.h @@ -183,6 +183,7 @@ extern struct req_format RQF_MDS_REINT_LINK; extern struct req_format RQF_MDS_REINT_RENAME; extern struct req_format RQF_MDS_REINT_SETATTR; extern struct req_format RQF_MDS_REINT_SETXATTR; +extern struct req_format RQF_MDS_REINT_SNAPSHOT; extern struct req_format
RQF_MDS_QUOTACHECK; extern struct req_format RQF_MDS_QUOTACTL; extern struct req_format RQF_QC_CALLBACK; @@ -216,6 +217,7 @@ extern struct req_format RQF_OST_GET_INFO_LAST_ID; extern struct req_format RQF_OST_GET_INFO_LAST_FID; extern struct req_format RQF_OST_SET_INFO_LAST_FID; extern struct req_format RQF_OST_GET_INFO_FIEMAP; +extern struct req_format RQF_OST_SNAPSHOT; /* LDLM req_format */ extern struct req_format RQF_LDLM_ENQUEUE; @@ -282,6 +284,7 @@ extern struct req_msg_field RMF_LAYOUT_INTENT; extern struct req_msg_field RMF_MDT_MD; extern struct req_msg_field RMF_REC_REINT; extern struct req_msg_field RMF_EADATA; +extern struct req_msg_field RMF_EADATA2; extern struct req_msg_field RMF_EAVALS; extern struct req_msg_field RMF_EAVALS_LENS; extern struct req_msg_field RMF_ACL; @@ -301,6 +304,7 @@ extern struct req_msg_field RMF_HSM_USER_STATE; extern struct req_msg_field RMF_HSM_STATE_SET; extern struct req_msg_field RMF_MDS_HSM_CURRENT_ACTION; extern struct req_msg_field RMF_MDS_HSM_REQUEST; +extern struct req_msg_field RMF_SNAP_EANAME; /* seq-mgr fields */ extern struct req_msg_field RMF_SEQ_OPC; diff --git a/lustre/include/lustre_snapshot.h b/lustre/include/lustre_snapshot.h new file mode 100644 index 0000000..8c2447c --- /dev/null +++ b/lustre/include/lustre_snapshot.h @@ -0,0 +1,79 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright(c) 2016-2017 FUJITSU LIMITED. + * All rights reserved. + */ +#ifndef __LUSTRE_SNAPSHOT_H +#define __LUSTRE_SNAPSHOT_H + +#ifdef __KERNEL__ +#include +#include +#include +#include +#include +#include +#include +#else /* !__KERNEL__ */ +#include +#include +#include +#include +#include +#include +#include +#endif /* __KERNEL__ */ + +/* same as ldiskfs/snapshot.h */ + +/* snapshot flags in lustre. + * other snapshot flags are defined + * ldiskfs/snapshot.h */ + +#define SNAPSHOT_FLAGS_MASK \ + (LUSTRE_SNAPSHOT_FL | LUSTRE_SNAPSHOT_SP_FL | LUSTRE_SNAPSHOT_SHARE_FL) +#define MDT_SNAPSHOT_FILE_PATT LUSTRE_SNAPSHOT_FL +#define MDT_SNAPSHOT_DIR_PATT (LUSTRE_SNAPSHOT_FL | LUSTRE_SNAPSHOT_SP_FL) +#define OST_SNAPSHOT_FILE_PATT \ + (LUSTRE_SNAPSHOT_SHARE_FL | LUSTRE_SNAPSHOT_SP_FL) + +/* snapshot name dir or snapshot */ +#define LUSTRE_TEST_MDT_SNAPSHOT_FILE(flags) \ + (((flags) & SNAPSHOT_FLAGS_MASK) == MDT_SNAPSHOT_FILE_PATT) + +/* snapshot hidden dir */ +#define LUSTRE_TEST_MDT_SNAPSHOT_DIR(flags) \ + (((flags) & SNAPSHOT_FLAGS_MASK) == MDT_SNAPSHOT_DIR_PATT) + +#define LUSTRE_TEST_MDT_SNAPSHOT(flags) \ + (((flags) & (LUSTRE_SNAPSHOT_FL | LUSTRE_SNAPSHOT_SP_FL)) != 0) + +/* snapshot error code in lustre. 
+ * other snapshot error codes are defined in + * ldiskfs/snapshot.h */ + +#define SNAPSHOT_OST_ERR_DEL 2453 +#define SNAPSHOT_OST_ERR_MSG \ + "Snapshot %s: error destroying object "DFID": %d.\n" + +#endif /* __LUSTRE_SNAPSHOT_H */ diff --git a/lustre/include/md_object.h b/lustre/include/md_object.h index 0b44b64..f39c36c 100644 --- a/lustre/include/md_object.h +++ b/lustre/include/md_object.h @@ -259,6 +259,11 @@ struct md_object_operations { struct md_object *obj, struct ldlm_enqueue_info *einfo, union ldlm_policy_data *policy); + + int (*moo_snapshot_clone)(const struct lu_env *env, + struct md_object *snap_obj, + struct md_attr *ma, + struct lu_buf *buf); }; /** @@ -351,6 +356,11 @@ struct md_device_operations { int (*mdo_iocontrol)(const struct lu_env *env, struct md_device *m, unsigned int cmd, int len, void *data); + + int (*mdo_snapshot_get_enable)(const struct lu_env *env, + struct md_device *m); + int (*mdo_snapshot_set_enable)(const struct lu_env *env, + struct md_device *m); }; enum md_upcall_event { @@ -693,6 +703,15 @@ static inline int mo_object_unlock(const struct lu_env *env, return m->mo_ops->moo_object_unlock(env, m, einfo, policy); } +static inline int mo_snapshot_clone(const struct lu_env *env, + struct md_object *snap, + struct md_attr *at, + struct lu_buf *buf) +{ + LASSERT(snap->mo_ops->moo_snapshot_clone); + return snap->mo_ops->moo_snapshot_clone(env, snap, at, buf); +} + static inline int mdo_lookup(const struct lu_env *env, struct md_object *p, const struct lu_name *lname, diff --git a/lustre/include/obd.h b/lustre/include/obd.h index a55bba0..00ad216 100644 --- a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -768,6 +768,8 @@ enum obd_cleanup_stage { #define KEY_CACHE_LRU_SHRINK "cache_lru_shrink" #define KEY_OSP_CONNECTED "osp_connected" +#define KEY_SNAPSHOT_ENABLED "snapshot_enabled" + struct lu_context; /* /!\ must be coherent with include/linux/namei.h on patched kernel */ @@ -851,6 +853,10 @@ struct md_op_data { /* File object 
data version for HSM release, on client */ __u64 op_data_version; struct lustre_handle op_lease_handle; + + /* snapshot data, on client */ + const char *op_eaname; + int op_eanamelen; }; #define op_stripe_offset op_ioepoch @@ -1000,6 +1006,20 @@ struct obd_ops { char *ostname); void (*o_getref)(struct obd_device *obd); void (*o_putref)(struct obd_device *obd); + + /* snapshot methods */ + int (*o_snapshot_lock)(const struct lu_env *env, + struct obd_export *exp, + struct obdo *oa, bool, + void **lock); + int (*o_snapshot_unlock)(const struct lu_env *env, + struct obd_export *exp, + struct obdo *oa, void *lock); + int (*o_snapshot_get_info)(const struct lu_env *env, + struct obd_export *exp, + struct obdo *oa, int *type); + int (*o_snapshot_cancel_lock)(struct obd_export *exp, + void *val); /* * NOTE: If adding ops, add another LPROCFS_OBD_OP_INIT() line * to lprocfs_alloc_obd_stats() in obdclass/lprocfs_status.c. @@ -1164,6 +1184,9 @@ struct md_ops { const struct lmv_stripe_md *, const char *name, int namelen, struct lu_fid *fid); + + int (*m_snapshot)(struct obd_export *, struct md_op_data *, + struct ptlrpc_request **); }; struct lsm_operations { diff --git a/lustre/include/obd_class.h b/lustre/include/obd_class.h index 331a9e6..b14ba57 100644 --- a/lustre/include/obd_class.h +++ b/lustre/include/obd_class.h @@ -1452,6 +1452,75 @@ static inline int obd_register_observer(struct obd_device *obd, RETURN(0); } +/* + * obd_snapshot_lock() + * + * snapshot lock inline function at ofd layer + * + * \param[in] env environment + * \param[in] exp obd export + * \param[in] oa object device + * \param[in] create for create snapshot + * \param[out] lock snapshot lock object + * + * \retval 0 success + * \retval less than 0 failure (-errno) + */ +static inline int obd_snapshot_lock(const struct lu_env *env, + struct obd_export *exp, + struct obdo *oa, + bool create, + void **lock) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, snapshot_lock); + EXP_COUNTER_INCREMENT(exp, 
snapshot_lock); + + rc = OBP(exp->exp_obd, snapshot_lock)(env, exp, oa, create, lock); + RETURN(rc); +} + +/* + * obd_snapshot_unlock() + * + * snapshot unlock inline function at ofd layer + * + * \param[in] env environment + * \param[in] exp obd export + * \param[in] oa object device + * \param[out] lock snapshot lock object + * + * \retval 0 success + * \retval less than 0 failure (-errno) + */ +static inline int obd_snapshot_unlock(const struct lu_env *env, + struct obd_export *exp, + struct obdo *oa, + void *lock) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, snapshot_unlock); + EXP_COUNTER_INCREMENT(exp, snapshot_unlock); + rc = OBP(exp->exp_obd, snapshot_unlock)(env, exp, oa, lock); + RETURN(rc); +} + +static inline int obd_snapshot_cancel_lock(struct obd_export *exp, + void *val) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, snapshot_cancel_lock); + EXP_COUNTER_INCREMENT(exp, snapshot_cancel_lock); + rc = OBP(exp->exp_obd, snapshot_cancel_lock)(exp, val); + RETURN(rc); +} + /* metadata helpers */ static inline int md_getstatus(struct obd_export *exp, struct lu_fid *fid, struct obd_capa **pc) @@ -1873,6 +1942,18 @@ static inline int md_get_fid_from_lsm(struct obd_export *exp, RETURN(rc); } +static inline int md_snapshot(struct obd_export *exp, + struct md_op_data *op_data, + struct ptlrpc_request **req) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, snapshot); + EXP_MD_COUNTER_INCREMENT(exp, snapshot); + rc = MDP(exp->exp_obd, snapshot)(exp, op_data, req); + RETURN(rc); +} + /* OBD Metadata Support */ extern int obd_init_caches(void); diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 0044fb7..fb8d9d7 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -316,6 +316,7 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type, #define OBD_FAIL_OST_STATFS_EINPROGRESS 0x231 #define OBD_FAIL_OST_SET_INFO_NET 0x232 #define OBD_FAIL_OST_NODESTROY 0x233 +#define OBD_FAIL_OST_SNAPSHOT_NET 0x2ff 
#define OBD_FAIL_LDLM 0x300 #define OBD_FAIL_LDLM_NAMESPACE_NEW 0x301 diff --git a/lustre/llite/Makefile.in b/lustre/llite/Makefile.in index 562b9d0..e3f49c4 100644 --- a/lustre/llite/Makefile.in +++ b/lustre/llite/Makefile.in @@ -6,6 +6,7 @@ lustre-objs += xattr.o xattr_cache.o remote_perm.o llite_rmtacl.o llite_capa.o lustre-objs += rw26.o super25.o statahead.o lustre-objs += ../lclient/glimpse.o ../lclient/lcommon_cl.o ../lclient/lcommon_misc.o lustre-objs += vvp_dev.o vvp_page.o vvp_lock.o vvp_io.o vvp_object.o +lustre-objs += llite_snapshot.o llite_lloop-objs := lloop.o diff --git a/lustre/llite/dir.c b/lustre/llite/dir.c index cea0dd3..6172b22 100644 --- a/lustre/llite/dir.c +++ b/lustre/llite/dir.c @@ -45,6 +45,7 @@ #include #include // for wait_on_buffer #include +#include #define DEBUG_SUBSYSTEM S_LLITE @@ -1040,6 +1041,442 @@ ll_getname(const char __user *filename) #define ll_putname(filename) __putname(filename) +/* + * ll_snapshot_check_enabled() + * + * check snapshot enabled + * + * \param[in] inode snapshot parent directory inode + * + * \retval 0 disabled + * \retval 1 enabled + * \retval not 0, 1 failure + */ +static int ll_snapshot_check_enabled(struct inode *inode) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + int enabled, vallen; + int rc; + ENTRY; + + if (sbi->ll_snapshot == 1) + RETURN(1); + + /* lock */ + snapshot_llite_lock(); + + /* create request data */ + enabled = 0; + vallen = sizeof(enabled); + rc = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_SNAPSHOT_ENABLED), + KEY_SNAPSHOT_ENABLED, &vallen, &enabled, NULL); + if (rc) { + snapshot_llite_unlock(); + RETURN(rc); + } + + /* write sb_info when snapshot enabled */ + if (enabled == 1) + sbi->ll_snapshot = 1; + + /* unlock */ + snapshot_llite_unlock(); + + RETURN(enabled); +} + +/* + * ll_snapshot_create() + * + * create snapshot + * + * \param[in] inode snapshot parent directory inode + * \param[in] snap_data snapshot request data + * + * \retval 0 success + * \retval not 0 failure + 
*/ +static int ll_snapshot_create(struct file *file, + struct ll_sb_info *sbi, + struct inode *inode, + struct snapshot_data *snap_data) +{ + struct obd_export *lmv = ll_i2mdexp(inode); + struct file *src_file = NULL; + struct inode *src_inode = NULL; + struct md_op_data *op_data; + struct ptlrpc_request *req = NULL; + struct ss_handle *handle; + struct lov_mds_md *lmm; + char *ptr, *ptr_val; + int size, size_val; + int i, len; + int rc = 0; + ENTRY; + + /* resolve the source (original) file/dir when a descriptor is given. + * NOTE(review): fcheck() takes no reference on the file; confirm the + * caller keeps src_fd open for the whole call (or switch to fget) */ + if (snap_data->req.create.src_fd != -1) { + src_file = fcheck(snap_data->req.create.src_fd); + if (!src_file) + GOTO(err, rc = -EINVAL); + src_inode = src_file->f_path.dentry->d_inode; + if (src_inode->i_flags & LUSTRE_SNAPSHOT_SP_FL) + GOTO(err, rc = -EALREADY); + } else if (!snap_data->req.create.hidden_f) { + /* a non-hidden snapshot entry reads the layout and xattrs of + * the source further down, so a source is mandatory here; + * without it src_inode/src_file would be dereferenced as NULL */ + GOTO(err, rc = -EINVAL); + } + + /* prepare md data */ + op_data = ll_prep_md_op_data(NULL, + inode, + NULL, + snap_data->req.create.name, + snap_data->req.create.name_len, + snap_data->req.create.mode, + LUSTRE_OPC_ANY, + NULL); + if (IS_ERR(op_data)) + GOTO(err, rc = PTR_ERR(op_data)); + + /* set fid3 */ + if (snap_data->req.create.src_fd != -1) + op_data->op_fid3 = + *ll_inode2fid(src_file->f_path.dentry->d_inode); + + /* set bias */ + op_data->op_bias = 0; + /* set MDS_SNAPSHOT only snapshot hidden directory (.l_snapshot) */ + if (snap_data->req.create.hidden_f == 0x1) + op_data->op_bias += MDS_SNAPSHOT; + + /* snapshot memory open */ + handle = snapshot_mem_open(); + if (handle == NULL) { + ll_finish_md_op_data(op_data); + GOTO(err, rc = -ENOMEM); + } + + /* MEMO: + * op_fid1 snapshot directory + * op_fid2 NULL + * op_fid3 original (src file/dir) + * op_name snapshot name + * op_namelen snapshot name length + * op_mode snapshot attr + * op_bias snapshot flag + */ + + /* set attribute */ + op_data->op_attr.ia_valid |= ATTR_MODE; + op_data->op_valid |= (OBD_MD_FLMODE | OBD_MD_FLTYPE); + + if (snap_data->req.create.hidden_f) { + /* snapshot hidden directory or snapshot name directory*/ + + /* only snapshot hidden directory */ + if
(snap_data->req.create.hidden_f == 0x1 && + uid_eq(current_fsuid(), GLOBAL_ROOT_UID)) { + op_data->op_fsuid = snap_data->req.create.uid; + op_data->op_fsgid = snap_data->req.create.gid; + } + + op_data->op_attr.ia_mode = snap_data->req.create.mode; + + op_data->op_attr.ia_atime = CFS_CURRENT_TIME; + op_data->op_attr.ia_mtime = CFS_CURRENT_TIME; + op_data->op_attr.ia_ctime = CFS_CURRENT_TIME; + } else { + /* snapshot directory or file */ + op_data->op_attr.ia_mode = snap_data->req.create.mode; + op_data->op_attr.ia_atime = snap_data->req.create.atim; + op_data->op_attr.ia_mtime = snap_data->req.create.mtim; + op_data->op_attr.ia_ctime = CFS_CURRENT_TIME; + + /* get lov attribute buffer */ + rc = ll_dir_getstripe(src_inode, (void **)&lmm, + &size_val, &req, 0); + if (rc == -ENODATA) + size_val = 0; /* skip no lov */ + else if (rc != 0) + GOTO(err_free_data, rc); + + if (size_val > 0) { + /* adjust lmm_stripe_offset of file */ + if (S_ISREG(snap_data->req.create.mode)) { + struct lov_ost_data_v1 *objs; + if (lmm->lmm_magic == LOV_MAGIC_V1) + objs = &((struct lov_mds_md_v1 *)lmm) + ->lmm_objects[0]; + else + objs = &((struct lov_mds_md_v3 *)lmm) + ->lmm_objects[0]; + lmm->lmm_layout_gen = objs->l_ost_idx; + } + + /* alloc memory */ + rc = snapshot_mem_realloc(handle, + SNAPSHOT_MEM_VALUE, size_val); + if (rc) + GOTO(err_free_data, rc); + + /* get lov attribute */ + ptr_val = handle->value[SNAPSHOT_MEM_VALUE].lb_buf; + memcpy(ptr_val, lmm, size_val); + + /* set lov data */ + op_data->op_data = ptr_val; + op_data->op_data_size = size_val; + } + + /* finish req */ + ptlrpc_req_finished(req); + req = NULL; + } + + /* call lmv */ + rc = md_snapshot(lmv, op_data, &req); + if (rc) + GOTO(err_free_data, rc); + + /* update original file access time */ + if (snap_data->req.create.src_fd != -1) + ll_update_times(req, src_file->f_path.dentry->d_inode); + + /* finish req */ + ptlrpc_req_finished(req); + req = NULL; + + if (snap_data->req.create.hidden_f) + GOTO(out, rc); + + /* + * 
set extended attribute + */ + + /* get xattr list buffer */ + size = ll_listxattr(src_file->f_path.dentry, NULL, 0); + if (size < 0) + GOTO(err_free_data, rc = size); + if (size == 0) + GOTO(out, rc); + + rc = snapshot_mem_realloc(handle, SNAPSHOT_MEM_LIST, size); + if (rc) + GOTO(err_free_data, rc); + + /* get xattr list */ + ptr = handle->value[SNAPSHOT_MEM_LIST].lb_buf; + size = ll_listxattr(src_file->f_path.dentry, ptr, size); + if (size < 0) + GOTO(err_free_data, rc = size); + + for (i = 0; i < size; i += (len + 1), ptr += (len + 1)) { + + /* length of attribute name */ + len = strlen(ptr); + + /* skip the extended attribute names listed below */ + if (strncmp(ptr, XATTR_TRUSTED_PREFIX, + sizeof(XATTR_TRUSTED_PREFIX) - 1) == 0) { + if (strcmp(ptr, XATTR_NAME_LMA) == 0 || + strcmp(ptr, XATTR_NAME_LMV) == 0 || + strcmp(ptr, XATTR_NAME_LINK) == 0 || + strcmp(ptr, XATTR_NAME_FID) == 0 || + strcmp(ptr, XATTR_NAME_VERSION) == 0 || + strcmp(ptr, XATTR_NAME_SOM) == 0 || + strcmp(ptr, XATTR_NAME_HSM) == 0 || + strcmp(ptr, XATTR_NAME_LFSCK_NAMESPACE) == 0) { + continue; + } + } + + /* skip lov attribute */ + if (strcmp(ptr, XATTR_NAME_LOV) == 0 || + strcmp(ptr, XATTR_LUSTRE_PREFIX "lov") == 0) { + continue; + } + + /* get xattr value buffer */ + size_val = ll_getxattr(src_file->f_path.dentry, ptr, NULL, 0); + if (size_val < 0) + GOTO(err_free_data, rc = size_val); + if (size_val == 0) + ptr_val = ""; + else { + rc = snapshot_mem_realloc(handle, SNAPSHOT_MEM_VALUE, + size_val); + if (rc) + GOTO(err_free_data, rc); + + /* get xattr value */ + ptr_val = handle->value[SNAPSHOT_MEM_VALUE].lb_buf; + size_val = ll_getxattr(src_file->f_path.dentry, ptr, + ptr_val, size_val); + if (size_val < 0) + GOTO(err_free_data, rc = size_val); + } + + /* set xattr name and data */ + op_data->op_eaname = ptr; + op_data->op_eanamelen = len; + op_data->op_data = ptr_val; + op_data->op_data_size = size_val; + + /* call lmv */ + rc = md_snapshot(lmv, op_data, &req); + if (rc) + GOTO(err_free_data, rc); +
+		/* finish req */
+		ptlrpc_req_finished(req);
+		req = NULL;
+	}
+
+out:
+	/* free md data */
+	ll_finish_md_op_data(op_data);
+
+	/* snapshot memory close */
+	snapshot_mem_close(handle);
+
+	RETURN(0);
+
+err_free_data:
+	/* free md data */
+	ll_finish_md_op_data(op_data);
+
+	/* snapshot memory close */
+	snapshot_mem_close(handle);
+
+	/* request finished */
+	if (req)
+		ptlrpc_req_finished(req);
+
+err:
+	RETURN(rc);
+}
+
+/*
+ * ll_snapshot_unlink()
+ *
+ * delete snapshot: remove the entry named snap_data->req.create.name
+ *
+ * \param[in] file	snapshot parent directory file
+ * \param[in] sbi	llite super block info (not used by this function)
+ * \param[in] inode	snapshot parent directory inode
+ * \param[in] snap_data	snapshot request data
+ *
+ * \retval 0		success
+ * \retval not 0	failure
+ */
+static int ll_snapshot_unlink(struct file *file,
+			      struct ll_sb_info *sbi,
+			      struct inode *inode,
+			      struct snapshot_data *snap_data)
+{
+	struct dentry *pdentry = file->f_path.dentry;
+	struct dentry *dentry;
+	struct qstr name;
+	struct kstat stat;
+	int rc;
+	ENTRY;
+
+	/* NOTE(review): user-supplied name; confirm it is NUL-terminated */
+	name.name = snap_data->req.create.name;
+	name.len = strlen(snap_data->req.create.name);
+	/* d_lookup() and d_alloc() both consume name.hash */
+	name.hash = full_name_hash(name.name, name.len);
+
+	dentry = d_lookup(pdentry, &name);
+	if (dentry == NULL) {
+		struct dentry *new = d_alloc(pdentry, &name);
+
+		if (new == NULL)
+			RETURN(-ENOMEM);
+		dentry = ll_iop_lookup(inode, new);
+		if (IS_ERR(dentry)) {
+			dput(new);
+			RETURN(PTR_ERR(dentry));
+		}
+		if (dentry != NULL)
+			dput(new);
+		else
+			dentry = new;
+	}
+
+	/* a cached dentry may be negative just like a fresh one */
+	if (dentry->d_inode == NULL)
+		GOTO(out, rc = -ENOENT);
+
+	rc = ll_getattr(file->f_path.mnt, dentry, &stat);
+	if (rc)
+		GOTO(out, rc);
+
+	if (S_ISREG(stat.mode))
+		rc = ll_unlink_for_snapshot(inode, pdentry, dentry, &name);
+	else if (S_ISDIR(stat.mode))
+		rc = ll_rmdir_for_snapshot(inode, pdentry, dentry, &name);
+	else
+		rc = -EINVAL;
+out:
+	dput(dentry);
+	RETURN(rc);
+}
+
+/*
+ * ll_snapshot()
+ *
+ * create snapshot
+ *
+ * \param[in] inode snapshot parent directory inode
+ * \param[in] snap_data snapshot request data
+ *
+
* \retval 0 success + * \retval not 0 failure + */ +static int ll_snapshot(struct file *file, struct ll_sb_info *sbi, + struct inode *inode, struct snapshot_data *snap_data) +{ + int rc; + ENTRY; + + rc = 0; + switch (snap_data->subcmd) { + /* check snapshot enabled */ + case LL_SNAPSHOT_CHKENABLED: + rc = ll_snapshot_check_enabled(inode); + break; + + /* snapshot create */ + case LL_SNAPSHOT_CREATE: + /* snapshot create */ + rc = ll_snapshot_create(file, sbi, inode, snap_data); + break; + + /* snapshot create(post) */ + case LL_SNAPSHOT_CREATE_POST: + /* memory clean */ + snapshot_mem_clean(); + break; + + /* snapshot delete */ + case LL_SNAPSHOT_UNLINK: + /* snapshot delete */ + rc = ll_snapshot_unlink(file, sbi, inode, snap_data); + break; + + /* get snapshot directory or not */ + case LL_SNAPSHOT_STAT: + /* get snapshot inode flags */ + rc = (inode->i_flags & LUSTRE_SNAPSHOT_SP_FL) ? 1 : 0; + break; + + default: + rc = -ENOTTY; + break; + } + + RETURN(rc); +} + static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct inode *inode = file->f_dentry->d_inode; @@ -1812,6 +2249,24 @@ migrate_free: RETURN(rc); } + case LL_IOC_SNAPSHOT: { + struct snapshot_data *data; + + if (!S_ISDIR(inode->i_mode)) + RETURN(-EINVAL); + + OBD_ALLOC_PTR(data); + if (data == NULL) + RETURN(-ENOMEM); + if (copy_from_user(data, (void *)arg, sizeof(*data))) { + OBD_FREE_PTR(data); + RETURN(-EFAULT); + } + rc = ll_snapshot(file, sbi, inode, data); + OBD_FREE_PTR(data); + + RETURN(rc); + } default: RETURN(obd_iocontrol(cmd, sbi->ll_dt_exp, 0, NULL, (void *)arg)); diff --git a/lustre/llite/file.c b/lustre/llite/file.c index 6582be3..ace672f 100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -51,6 +51,7 @@ #include #include "cl_object.h" +#include static int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg); @@ -3346,6 +3347,26 @@ static int ll_merge_md_attr(struct inode *inode) RETURN(0); } +static int 
ll_snaphot_prepare_glimpse(struct inode *inode) +{ + struct lov_stripe_md *lsm = NULL; + int rc = 0; + ENTRY; + + if (!LUSTRE_TEST_MDT_SNAPSHOT_FILE(inode->i_flags)) + RETURN(0); + + lsm = ccc_inode_lsm_get(inode); + if (lsm == NULL) { + CERROR("ino=%lu lsm is NULL\n", inode->i_ino); + RETURN(-ENOENT); + } + + rc = obd_snapshot_cancel_lock(ll_i2dtexp(inode), lsm); + ccc_inode_lsm_put(inode, lsm); + RETURN(rc); +} + static int ll_inode_revalidate(struct dentry *dentry, __u64 ibits) { @@ -3377,8 +3398,10 @@ ll_inode_revalidate(struct dentry *dentry, __u64 ibits) * restore the MDT holds the layout lock so the glimpse will * block up to the end of restore (getattr will block) */ - if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING)) + if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING)) { + ll_snaphot_prepare_glimpse(inode); rc = ll_glimpse_size(inode); + } } RETURN(rc); } diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h index 089b39d..2faa53a 100644 --- a/lustre/llite/llite_internal.h +++ b/lustre/llite/llite_internal.h @@ -553,6 +553,8 @@ struct ll_sb_info { /* root squash */ struct root_squash_info ll_squash; + + int ll_snapshot; /* snapshot enabled status */ }; #define LL_DEFAULT_MAX_RW_CHUNK (32 * 1024 * 1024) @@ -744,6 +746,10 @@ int ll_md_blocking_ast(struct ldlm_lock *, struct ldlm_lock_desc *, struct dentry *ll_splice_alias(struct inode *inode, struct dentry *de); int ll_rmdir_entry(struct inode *dir, char *name, int namelen); void ll_update_times(struct ptlrpc_request *request, struct inode *inode); +int ll_rmdir_for_snapshot(struct inode *dir, struct dentry *dparent, + struct dentry *dchild, struct qstr *name); +int ll_unlink_for_snapshot(struct inode *dir, struct dentry *dparent, + struct dentry *dchild, struct qstr *name); /* llite/rw.c */ int ll_writepage(struct page *page, struct writeback_control *wbc); @@ -1602,4 +1608,25 @@ void ll_xattr_fini(void); int ll_page_sync_io(const struct lu_env *env, struct cl_io *io, 
struct cl_page *page, enum cl_req_type crt); +/* llite/llite_snapshot.c */ +enum { + SNAPSHOT_MEM_LIST = 0, + SNAPSHOT_MEM_VALUE, + SNAPSHOT_MEM_MAXNUM +}; + +struct ss_handle { + struct lu_buf value[SNAPSHOT_MEM_MAXNUM]; + struct list_head list; +}; + +void snapshot_llite_init(void); +void snapshot_llite_destroy(void); +void snapshot_llite_lock(void); +void snapshot_llite_unlock(void); +struct ss_handle *snapshot_mem_open(void); +void snapshot_mem_close(struct ss_handle *ptr); +void snapshot_mem_clean(void); +int snapshot_mem_realloc(struct ss_handle *ptr, int id, ssize_t size); + #endif /* LLITE_INTERNAL_H */ diff --git a/lustre/llite/llite_snapshot.c b/lustre/llite/llite_snapshot.c new file mode 100644 index 0000000..f0afc8b --- /dev/null +++ b/lustre/llite/llite_snapshot.c @@ -0,0 +1,250 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright(c) 2016-2017 FUJITSU LIMITED. + * All rights reserved. 
+ */
+#include
+#include
+#include
+#include "llite_internal.h"
+
+/* snapshot memory management */
+static struct list_head ss_head;
+static spinlock_t ss_lock;
+
+/* ll_snapshot_check_enable */
+static struct mutex ss_mutex;
+
+/*
+ * snapshot_mem_free()
+ *
+ * free a memory handle together with every value buffer it still owns
+ *
+ * \param[in] ptr	memory handle
+ */
+static void snapshot_mem_free(struct ss_handle *ptr)
+{
+	int i;
+
+	/* free value area: one buffer per SNAPSHOT_MEM_* slot */
+	for (i = 0; i < SNAPSHOT_MEM_MAXNUM; i++) {
+		if (ptr->value[i].lb_len > 0)
+			OBD_FREE(ptr->value[i].lb_buf, ptr->value[i].lb_len);
+	}
+
+	/* free handle area */
+	OBD_FREE(ptr, sizeof(struct ss_handle));
+}
+
+/*
+ * snapshot_llite_init()
+ *
+ * initialize of snapshot memory management (for llite)
+ */
+void snapshot_llite_init(void)
+{
+	/* init table */
+	INIT_LIST_HEAD(&ss_head);
+
+	/* init spinlock object */
+	spin_lock_init(&ss_lock);
+
+	/* init mutex (for ll_snapshot_check_enable) */
+	mutex_init(&ss_mutex);
+
+	return;
+}
+
+/*
+ * snapshot_llite_destroy()
+ *
+ * finalize of snapshot memory management (for llite)
+ */
+void snapshot_llite_destroy(void)
+{
+	/* free memory, and handle */
+	snapshot_mem_clean();
+
+	return;
+}
+
+/*
+ * snapshot_llite_lock()
+ *
+ * lock mutex for ll_snapshot_check_enable
+ */
+void snapshot_llite_lock(void)
+{
+	/* lock mutex (for ll_snapshot_check_enable) */
+	mutex_lock(&ss_mutex);
+
+	return;
+}
+
+/*
+ * snapshot_llite_unlock()
+ *
+ * unlock mutex for ll_snapshot_check_enable
+ */
+void snapshot_llite_unlock(void)
+{
+	/* unlock mutex (for ll_snapshot_check_enable) */
+	mutex_unlock(&ss_mutex);
+
+	return;
+}
+
+/*
+ * snapshot_mem_open()
+ *
+ * memory handle open
+ *
+ * \retval not NULL memory handle
+ * \retval NULL error
+ */
+struct ss_handle *snapshot_mem_open(void)
+{
+	struct ss_handle *ptr;
+
+	/* lock table */
+	spin_lock(&ss_lock);
+
+	/* find non used table */
+	if (!list_empty(&ss_head)) {
+
+		/* get handle */
+		ptr = list_entry(ss_head.next, struct
ss_handle, list);
+		list_del_init(&ptr->list);
+
+		/* unlock table */
+		spin_unlock(&ss_lock);
+
+		return ptr;
+	}
+
+	/* unlock table */
+	spin_unlock(&ss_lock);
+
+	/* create new table */
+	OBD_ALLOC(ptr, sizeof(struct ss_handle));
+	if (ptr == NULL)
+		return NULL;
+
+	/* initialize new table */
+	memset(ptr, 0x0, sizeof(struct ss_handle));
+	INIT_LIST_HEAD(&ptr->list);
+
+	return ptr;
+}
+
+/*
+ * snapshot_mem_close()
+ *
+ * memory handle close
+ *
+ * \param[in] ptr memory handle
+ */
+void snapshot_mem_close(struct ss_handle *ptr)
+{
+	/* lock table */
+	spin_lock(&ss_lock);
+
+	/* save handle */
+	list_add(&ptr->list, &ss_head);
+
+	/* unlock table */
+	spin_unlock(&ss_lock);
+
+	return;
+}
+
+/*
+ * snapshot_mem_clean()
+ *
+ * memory handle clean. every handle cached on ss_head is released.
+ *
+ * Takes no argument; handles currently held by a caller are not on the list.
+ */
+void snapshot_mem_clean(void)
+{
+	struct list_head victims = LIST_HEAD_INIT(victims);
+	struct ss_handle *ptr, *tmp;
+
+	/* detach every cached handle while holding the table lock */
+	spin_lock(&ss_lock);
+	list_splice_init(&ss_head, &victims);
+	spin_unlock(&ss_lock);
+
+	/*
+	 * free memory, and delete handle list: done on the private list so
+	 * that ss_lock is not held across the OBD_FREE() calls made by
+	 * snapshot_mem_free()
+	 */
+	list_for_each_entry_safe(ptr, tmp, &victims, list) {
+		list_del(&ptr->list);
+		snapshot_mem_free(ptr);
+	}
+}
+
+/*
+ * snapshot_mem_realloc()
+ *
+ * memory allocate. It recycles if it has already acquired it. If the area is
+ * insufficient, it acquires it by the unit of 4K.
+ * + * \param[in/out] ptr memory handle + * \param[in] id buffer id (0 = xattr name, or 1 = xattr value) + * \param[in] size size + * + * \retval 0 success + * \retval not 0 error code + */ +int snapshot_mem_realloc(struct ss_handle *ptr, int id, ssize_t size) +{ + ssize_t new_size; + void *new_ptr; + + if (ptr->value[id].lb_len < size) { + + /* free buffer */ + if (ptr->value[id].lb_len > 0) { + OBD_FREE(ptr->value[id].lb_buf, ptr->value[id].lb_len); + ptr->value[id].lb_buf = NULL; + ptr->value[id].lb_len = 0; + } + + /* set new size */ + new_size = ((size >> 12) + 1) << 12; + + /* realloc buffer */ + OBD_ALLOC(new_ptr, new_size); + if (new_ptr == NULL) + return -ENOMEM; + + /* set handle */ + ptr->value[id].lb_buf = new_ptr; + ptr->value[id].lb_len = new_size; + } + + return 0; +} diff --git a/lustre/llite/namei.c b/lustre/llite/namei.c index 7770a61..cd95251 100644 --- a/lustre/llite/namei.c +++ b/lustre/llite/namei.c @@ -1125,8 +1125,9 @@ static int ll_mkdir_generic(struct inode *dir, struct qstr *name, RETURN(err); } -static int ll_rmdir_generic(struct inode *dir, struct dentry *dparent, - struct dentry *dchild, struct qstr *name) +static int __ll_rmdir_generic(struct inode *dir, struct dentry *dparent, + struct dentry *dchild, struct qstr *name, + int flag) { struct ptlrpc_request *request = NULL; struct md_op_data *op_data; @@ -1147,6 +1148,10 @@ static int ll_rmdir_generic(struct inode *dir, struct dentry *dparent, if (dchild != NULL && dchild->d_inode != NULL) op_data->op_fid3 = *ll_inode2fid(dchild->d_inode); op_data->op_fid2 = op_data->op_fid3; + + if (flag == 1) + op_data->op_bias |= MDS_SNAPSHOT; + rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request); ll_finish_md_op_data(op_data); if (rc == 0) { @@ -1158,6 +1163,18 @@ static int ll_rmdir_generic(struct inode *dir, struct dentry *dparent, RETURN(rc); } +static int ll_rmdir_generic(struct inode *dir, struct dentry *dparent, + struct dentry *dchild, struct qstr *name) +{ + return 
__ll_rmdir_generic(dir, dparent, dchild, name, 0); +} + +int ll_rmdir_for_snapshot(struct inode *dir, struct dentry *dparent, + struct dentry *dchild, struct qstr *name) +{ + return __ll_rmdir_generic(dir, dparent, dchild, name, 1); +} + /** * Remove dir entry **/ @@ -1267,8 +1284,9 @@ out: * Instead, ll_ddelete() and ll_d_iput() will update it based upon if there * is any lock existing. They will recycle dentries and inodes based upon locks * too. b=20433 */ -static int ll_unlink_generic(struct inode *dir, struct dentry *dparent, - struct dentry *dchild, struct qstr *name) +static int __ll_unlink_generic(struct inode *dir, struct dentry *dparent, + struct dentry *dchild, struct qstr *name, + int flag) { struct ptlrpc_request *request = NULL; struct md_op_data *op_data; @@ -1293,6 +1311,10 @@ static int ll_unlink_generic(struct inode *dir, struct dentry *dparent, op_data->op_fid3 = *ll_inode2fid(dchild->d_inode); op_data->op_fid2 = op_data->op_fid3; + + if (flag == 1) + op_data->op_bias |= MDS_SNAPSHOT; + rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request); ll_finish_md_op_data(op_data); if (rc) @@ -1307,6 +1329,18 @@ static int ll_unlink_generic(struct inode *dir, struct dentry *dparent, RETURN(rc); } +static int ll_unlink_generic(struct inode *dir, struct dentry *dparent, + struct dentry *dchild, struct qstr *name) +{ + return __ll_unlink_generic(dir, dparent, dchild, name, 0); +} + +int ll_unlink_for_snapshot(struct inode *dir, struct dentry *dparent, + struct dentry *dchild, struct qstr *name) +{ + return __ll_unlink_generic(dir, dparent, dchild, name, 1); +} + static int ll_rename_generic(struct inode *src, struct dentry *src_dparent, struct dentry *src_dchild, struct qstr *src_name, struct inode *tgt, struct dentry *tgt_dparent, diff --git a/lustre/llite/super25.c b/lustre/llite/super25.c index f63472b..5a6e867 100644 --- a/lustre/llite/super25.c +++ b/lustre/llite/super25.c @@ -202,6 +202,9 @@ static int __init init_lustre_lite(void) if (rc == 0) rc 
= ll_xattr_init(); + /* snapshot init */ + snapshot_llite_init(); + return rc; } @@ -231,6 +234,9 @@ static void __exit exit_lustre_lite(void) kmem_cache_destroy(ll_file_data_slab); if (proc_lustre_fs_root) lprocfs_remove(&proc_lustre_fs_root); + + /* snapshot destroy */ + snapshot_llite_destroy(); } MODULE_AUTHOR("Sun Microsystems, Inc. "); diff --git a/lustre/lmv/lmv_obd.c b/lustre/lmv/lmv_obd.c index d18b856..48d72d2 100644 --- a/lustre/lmv/lmv_obd.c +++ b/lustre/lmv/lmv_obd.c @@ -2926,7 +2926,8 @@ static int lmv_get_info(const struct lu_env *env, struct obd_export *exp, KEY_IS(KEY_DEFAULT_EASIZE) || KEY_IS(KEY_MAX_COOKIESIZE) || KEY_IS(KEY_DEFAULT_COOKIESIZE) || - KEY_IS(KEY_CONN_DATA)) { + KEY_IS(KEY_CONN_DATA) || + KEY_IS(KEY_SNAPSHOT_ENABLED)) { rc = lmv_check_connect(obd); if (rc) RETURN(rc); @@ -3626,6 +3627,62 @@ int lmv_merge_attr(struct obd_export *exp, const struct lmv_stripe_md *lsm, return 0; } +/* + * lmv_snapshot() + * + * lm snapshot + * + * \param[in] exp lmv obd export + * \param[in] op_data operation data + * \param[in] req portal rpc request + * + * \retval 0 success + * \retval not 0 failure + */ +int lmv_snapshot(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **req) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + ENTRY; + + /* check lmv */ + rc = lmv_check_connect(obd); + if (rc) + RETURN(rc); + + if (!lmv->desc.ld_active_tgt_count) + RETURN(-EIO); + + /* locate mds for snapshot directory */ + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + /* allocate fid for snapshot */ + if (!fid_is_sane(&op_data->op_fid2)) { + rc = lmv_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); + if (rc) + RETURN(rc); + } + /* Send the create request to the MDT where the object + * will be located */ + tgt = lmv_find_target(lmv, &op_data->op_fid2); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + op_data->op_mds 
= tgt->ltd_idx; + + op_data->op_flags |= MF_MDC_CANCEL_FID1; + + /* call mdc */ + rc = md_snapshot(tgt->ltd_exp, op_data, req); + + RETURN(rc); +} + struct obd_ops lmv_obd_ops = { .o_owner = THIS_MODULE, .o_setup = lmv_setup, @@ -3681,6 +3738,7 @@ struct md_ops lmv_md_ops = { .m_intent_getattr_async = lmv_intent_getattr_async, .m_revalidate_lock = lmv_revalidate_lock, .m_get_fid_from_lsm = lmv_get_fid_from_lsm, + .m_snapshot = lmv_snapshot, }; int __init lmv_init(void) diff --git a/lustre/lod/lod_internal.h b/lustre/lod/lod_internal.h index 22d248c..4716c56 100644 --- a/lustre/lod/lod_internal.h +++ b/lustre/lod/lod_internal.h @@ -451,6 +451,11 @@ int lod_pool_remove(struct obd_device *obd, char *poolname, char *ostname); int lod_qos_prep_create(const struct lu_env *env, struct lod_object *lo, struct lu_attr *attr, const struct lu_buf *buf, struct thandle *th); +int lod_qos_prep_create_for_snapshot(const struct lu_env *env, + struct lod_object *lo, + struct lu_attr *attr, + const struct lu_buf *buf, + struct thandle *th); int qos_add_tgt(struct lod_device*, struct lod_tgt_desc *); int qos_del_tgt(struct lod_device *, struct lod_tgt_desc *); @@ -465,6 +470,11 @@ int lod_object_set_pool(struct lod_object *o, char *pool); int lod_declare_striped_object(const struct lu_env *env, struct dt_object *dt, struct lu_attr *attr, const struct lu_buf *lovea, struct thandle *th); +int lod_declare_striped_object_for_snapshot(const struct lu_env *env, + struct dt_object *dt, + struct lu_attr *attr, + const struct lu_buf *lovea, + struct thandle *th); int lod_striping_create(const struct lu_env *env, struct dt_object *dt, struct lu_attr *attr, struct dt_object_format *dof, struct thandle *th); diff --git a/lustre/lod/lod_object.c b/lustre/lod/lod_object.c index 2ea9025..272494e 100644 --- a/lustre/lod/lod_object.c +++ b/lustre/lod/lod_object.c @@ -1863,7 +1863,11 @@ static int lod_declare_xattr_set(const struct lu_env *env, attr->la_valid = LA_TYPE | LA_MODE; attr->la_mode = 
S_IFREG; } - rc = lod_declare_striped_object(env, dt, attr, buf, th); + if (fl & LU_XATTR_SNAPSHOT) + rc = lod_declare_striped_object_for_snapshot(env, + dt, attr, buf, th); + else + rc = lod_declare_striped_object(env, dt, attr, buf, th); } else if (S_ISDIR(mode)) { rc = lod_dir_declare_xattr_set(env, dt, buf, name, fl, th); } else { @@ -2802,9 +2806,10 @@ static int lod_declare_init_size(const struct lu_env *env, /** * Create declaration of striped object */ -int lod_declare_striped_object(const struct lu_env *env, struct dt_object *dt, - struct lu_attr *attr, - const struct lu_buf *lovea, struct thandle *th) +int __lod_declare_striped_object(const struct lu_env *env, struct dt_object *dt, + struct lu_attr *attr, + const struct lu_buf *lovea, struct thandle *th, + int is_snapshot) { struct lod_thread_info *info = lod_env_info(env); struct dt_object *next = dt_object_child(dt); @@ -2821,7 +2826,11 @@ int lod_declare_striped_object(const struct lu_env *env, struct dt_object *dt, if (!dt_object_remote(next)) { /* choose OST and generate appropriate objects */ - rc = lod_qos_prep_create(env, lo, attr, lovea, th); + if (is_snapshot == 1) + rc = lod_qos_prep_create_for_snapshot(env, + lo, attr, lovea, th); + else + rc = lod_qos_prep_create(env, lo, attr, lovea, th); if (rc) { /* failed to create striping, let's reset * config so that others don't get confused */ @@ -2861,6 +2870,22 @@ out: RETURN(rc); } +int lod_declare_striped_object(const struct lu_env *env, struct dt_object *dt, + struct lu_attr *attr, + const struct lu_buf *lovea, struct thandle *th) +{ + return __lod_declare_striped_object(env, dt, attr, lovea, th, 0); +} + +int lod_declare_striped_object_for_snapshot(const struct lu_env *env, + struct dt_object *dt, + struct lu_attr *attr, + const struct lu_buf *lovea, + struct thandle *th) +{ + return __lod_declare_striped_object(env, dt, attr, lovea, th, 1); +} + static int lod_declare_object_create(const struct lu_env *env, struct dt_object *dt, struct lu_attr 
*attr,
@@ -3254,6 +3279,72 @@ out:
 	RETURN(rc);
 }
 
+/*
+ * lod_snapshot_clone()
+ *
+ * snapshot clone: clone, stripe by stripe, the OST objects listed in buf
+ *
+ * \param[in] env	environment
+ * \param[in] dt	snapshot lod object
+ * \param[in] attr	original attributes
+ * \param[in] buf	original or snapshot lov (see mdt_snapshot_unpack())
+ * \retval 0 on success; otherwise the error of the first failing stripe
+ */
+static int lod_snapshot_clone(const struct lu_env *env,
+			      struct dt_object *dt,
+			      struct lu_attr *attr,
+			      struct lu_buf *buf)
+{
+	struct lod_object *lod_obj = lod_dt_obj(dt);
+	struct lov_mds_md_v1 *lmm;
+	struct lov_ost_data_v1 *objs;
+	struct ost_id ost_id;
+	struct lu_fid orig_fid;
+	__u32 magic;
+	int i, rc;
+	ENTRY;
+
+	rc = lod_load_striping_locked(env, lod_obj);
+	if (rc)
+		RETURN(rc);
+
+	/* the header must be there before lmm_magic can be read */
+	lmm = buf != NULL ? (struct lov_mds_md_v1 *)buf->lb_buf : NULL;
+	if (lmm == NULL || buf->lb_len < sizeof(*lmm))
+		RETURN(-EFAULT);
+
+	magic = le32_to_cpu(lmm->lmm_magic);
+	if (buf->lb_len < lov_mds_md_size(lod_obj->ldo_stripenr, magic)) {
+		CDEBUG(D_WARNING, "invalid buf size %d\n", (int)buf->lb_len);
+		RETURN(-EFAULT);
+	}
+
+	/* pick the per-stripe array that belongs to this layout magic */
+	if (magic == LOV_MAGIC_V3)
+		objs = ((struct lov_mds_md_v3 *)lmm)->lmm_objects;
+	else
+		objs = lmm->lmm_objects;
+
+	for (i = 0; i < lod_obj->ldo_stripenr; i++, objs++) {
+		ostid_le_to_cpu(&objs->l_ost_oi, &ost_id);
+		rc = ostid_to_fid(&orig_fid, &ost_id,
+				  le32_to_cpu(objs->l_ost_idx));
+		if (rc != 0)
+			break;
+
+		rc = dt_osp_snapshot_clone(env, lod_obj->ldo_stripe[i],
+					   attr, &orig_fid,
+					   lu_object_fid(&dt->do_lu));
+		if (rc != 0) {
+			/* NOTE(review): no rollback is issued from here */
+			CDEBUG(D_ERROR, "failed to OST_SNAPSHOT, "
+			       "send OST_DESTROY to rollback\n");
+			break;
+		}
+	}
+	RETURN(rc);
+}
+
 struct dt_object_operations lod_obj_ops = {
 	.do_read_lock = lod_object_read_lock,
 	.do_write_lock = lod_object_write_lock,
@@ -3283,6 +3374,7 @@ struct dt_object_operations lod_obj_ops = {
 	.do_object_sync = lod_object_sync,
 	.do_object_lock = lod_object_lock,
 	.do_object_unlock = lod_object_unlock,
+	.do_lod_snapshot_clone = lod_snapshot_clone,
 };
 
 static ssize_t lod_read(const struct lu_env *env, struct dt_object *dt,
diff --git
a/lustre/lod/lod_qos.c b/lustre/lod/lod_qos.c
index ba70a97..aee627b 100644
--- a/lustre/lod/lod_qos.c
+++ b/lustre/lod/lod_qos.c
@@ -1188,6 +1188,69 @@ out_nolock:
 	RETURN(rc);
 }
 
+/*
+ * lod_alloc_snapshot()
+ *
+ * lod alloc for snapshot: declare stripe i on the OST index stored in
+ * orig[i] instead of letting the QoS/round-robin allocators choose
+ *
+ * \param[in] env	environment information
+ * \param[in/out] lo	lod object
+ * \param[in/out] stripe	stripe information
+ * \param[in] th	thread handle
+ * \param[in] orig	original data (one entry per stripe of lo)
+ *
+ * \retval 0		success
+ * \retval not 0	failure
+ */
+static int lod_alloc_snapshot(const struct lu_env *env, struct lod_object *lo,
+			      struct dt_object **stripe, struct thandle *th,
+			      struct lov_user_ost_data_v1 *orig)
+{
+	struct lod_device *m = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
+	struct pool_desc *pool = NULL;
+	struct dt_object *o;
+	unsigned int ost_idx;
+	int i, rc;
+	ENTRY;
+
+	rc = lod_qos_ost_in_use_clear(env, lo->ldo_stripenr);
+	if (rc)
+		GOTO(out, rc);
+
+	if (lo->ldo_pool)
+		pool = lod_find_pool(m, lo->ldo_pool);
+
+	/* NOTE(review): the pool is only read-locked here, never consulted */
+	if (pool != NULL)
+		down_read(&pool_tgt_rw_sem(pool));
+
+	for (i = 0; i < lo->ldo_stripenr; i++) {
+		/* NOTE(review): ost_idx is used exactly as found in orig;
+		 * confirm lod_qos_declare_object_on() rejects an index
+		 * that has no target behind it */
+		ost_idx = orig[i].l_ost_idx;
+		o = lod_qos_declare_object_on(env, m, ost_idx, th);
+		if (IS_ERR(o)) {
+			CDEBUG(D_OTHER,
+			       "cannot declare new object on #%u: %d\n",
+			       ost_idx, (int) PTR_ERR(o));
+			GOTO(out, rc = PTR_ERR(o));
+		}
+
+		lod_qos_ost_in_use(env, i, ost_idx);
+		stripe[i] = o;
+	}
+
+out:
+	if (pool != NULL) {
+		up_read(&pool_tgt_rw_sem(pool));
+		lod_pool_putref(pool);
+	}
+
+	RETURN(rc);
+}
+
 /* Find the max stripecount we should use */
 static __u16 lod_get_stripecnt(struct lod_device *lod, __u32 magic,
 				__u16 stripe_count)
@@ -1252,9 +1315,10 @@ out:
 	RETURN(rc);
 }
 
-static int lod_qos_parse_config(const struct lu_env *env,
-				struct lod_object *lo,
-				const struct lu_buf *buf)
+static int __lod_qos_parse_config(const struct lu_env *env,
+				  struct lod_object *lo,
+				  const struct lu_buf *buf,
+				  int
is_snapshot) { struct lod_device *d = lu2lod_dev(lod2lu_obj(lo)->lo_dev); struct lov_user_md_v1 *v1 = NULL; @@ -1273,10 +1337,18 @@ static int lod_qos_parse_config(const struct lu_env *env, if (magic == __swab32(LOV_USER_MAGIC_V1)) { lustre_swab_lov_user_md_v1(v1); magic = v1->lmm_magic; + if (is_snapshot == 1) { + lustre_swab_lov_user_md_objects(v1->lmm_objects, + v1->lmm_stripe_count); + } } else if (magic == __swab32(LOV_USER_MAGIC_V3)) { v3 = buf->lb_buf; lustre_swab_lov_user_md_v3(v3); magic = v3->lmm_magic; + if (is_snapshot == 1) { + lustre_swab_lov_user_md_objects(v3->lmm_objects, + v3->lmm_stripe_count); + } } if (unlikely(magic != LOV_MAGIC_V1 && magic != LOV_MAGIC_V3)) { @@ -1360,12 +1432,26 @@ static int lod_qos_parse_config(const struct lu_env *env, RETURN(0); } +static int lod_qos_parse_config(const struct lu_env *env, + struct lod_object *lo, + const struct lu_buf *buf) +{ + return __lod_qos_parse_config(env, lo, buf, 0); +} + +static int lod_qos_parse_config_for_snapshot(const struct lu_env *env, + struct lod_object *lo, + const struct lu_buf *buf) +{ + return __lod_qos_parse_config(env, lo, buf, 1); +} + /* * buf should be NULL or contain striping settings */ -int lod_qos_prep_create(const struct lu_env *env, struct lod_object *lo, - struct lu_attr *attr, const struct lu_buf *buf, - struct thandle *th) +int __lod_qos_prep_create(const struct lu_env *env, struct lod_object *lo, + struct lu_attr *attr, const struct lu_buf *buf, + struct thandle *th, int is_snapshot) { struct lod_device *d = lu2lod_dev(lod2lu_obj(lo)->lo_dev); struct dt_object **stripe; @@ -1390,7 +1476,10 @@ int lod_qos_prep_create(const struct lu_env *env, struct lod_object *lo, * in case the caller is passing lovea with new striping config, * we may need to parse lovea and apply new configuration */ - rc = lod_qos_parse_config(env, lo, buf); + if (is_snapshot == 1) + rc = lod_qos_parse_config_for_snapshot(env, lo, buf); + else + rc = lod_qos_parse_config(env, lo, buf); if (rc) 
GOTO(out, rc); @@ -1420,13 +1509,31 @@ int lod_qos_prep_create(const struct lu_env *env, struct lod_object *lo, /* XXX: support for non-0 files w/o objects */ CDEBUG(D_OTHER, "tgt_count %d stripenr %d\n", d->lod_desc.ld_tgt_count, stripe_len); - if (lo->ldo_def_stripe_offset >= d->lod_desc.ld_tgt_count) { - rc = lod_alloc_qos(env, lo, stripe, flag, th); - if (rc == -EAGAIN) - rc = lod_alloc_rr(env, lo, stripe, flag, th); + if (is_snapshot == 1) { + struct lov_user_md_v1 *v1 = buf->lb_buf; + struct lov_user_md_v3 *v3 = buf->lb_buf; + + if (lo->ldo_stripe_size != v1->lmm_stripe_size) + rc = -EINVAL; + else if (lo->ldo_stripenr != v1->lmm_stripe_count) + rc = -EINVAL; + else + rc = lod_alloc_snapshot(env, lo, stripe, th, + (v1->lmm_magic == LOV_USER_MAGIC) ? + v1->lmm_objects : v3->lmm_objects); } else { - rc = lod_alloc_specific(env, lo, stripe, flag, th); + if (lo->ldo_def_stripe_offset + >= d->lod_desc.ld_tgt_count) { + rc = lod_alloc_qos(env, lo, stripe, flag, th); + if (rc == -EAGAIN) + rc = lod_alloc_rr(env, + lo, stripe, flag, th); + } else { + rc = lod_alloc_specific(env, + lo, stripe, flag, th); + } } + lod_putref(d, &d->lod_ost_descs); if (rc < 0) { @@ -1464,3 +1571,18 @@ out: RETURN(rc); } +int lod_qos_prep_create(const struct lu_env *env, struct lod_object *lo, + struct lu_attr *attr, const struct lu_buf *buf, + struct thandle *th) +{ + return __lod_qos_prep_create(env, lo, attr, buf, th, 0); +} + +int lod_qos_prep_create_for_snapshot(const struct lu_env *env, + struct lod_object *lo, + struct lu_attr *attr, + const struct lu_buf *buf, + struct thandle *th) +{ + return __lod_qos_prep_create(env, lo, attr, buf, th, 1); +} diff --git a/lustre/lov/lov_obd.c b/lustre/lov/lov_obd.c index 7bacb3f..469415e 100644 --- a/lustre/lov/lov_obd.c +++ b/lustre/lov/lov_obd.c @@ -2390,6 +2390,37 @@ out: RETURN(rc); } +static int lov_snapshot_cancel_lock(struct obd_export *exp, + void *val) +{ + struct lov_stripe_md *lsm = val; + struct obd_device *obddev = 
class_exp2obd(exp); + struct lov_obd *lov = &obddev->u.lov; + int i, rc = 0; + ENTRY; + + obd_getref(obddev); + for (i = 0; i < lsm->lsm_stripe_count; i++) { + struct lov_oinfo *loi = lsm->lsm_oinfo[i]; + if (lov_oinfo_is_dummy(loi)) + continue; + if (!lov->lov_tgts[loi->loi_ost_idx]) { + CDEBUG(D_HA, "lov idx %d NULL\n", loi->loi_ost_idx); + continue; + } + rc = obd_snapshot_cancel_lock( + lov->lov_tgts[loi->loi_ost_idx]->ltd_exp, + &loi->loi_oi); + if (rc) { + CERROR("cancel lock failed. idx=%d\n", + loi->loi_ost_idx); + break; + } + } + obd_putref(obddev); + RETURN(rc); +} + static struct obd_ops lov_obd_ops = { .o_owner = THIS_MODULE, .o_setup = lov_setup, @@ -2419,6 +2450,7 @@ static struct obd_ops lov_obd_ops = { .o_putref = lov_putref, .o_quotactl = lov_quotactl, .o_quotacheck = lov_quotacheck, + .o_snapshot_cancel_lock = lov_snapshot_cancel_lock, }; struct kmem_cache *lov_oinfo_slab; diff --git a/lustre/mdc/mdc_internal.h b/lustre/mdc/mdc_internal.h index 1769f3e..d180dc5 100644 --- a/lustre/mdc/mdc_internal.h +++ b/lustre/mdc/mdc_internal.h @@ -69,6 +69,7 @@ void mdc_link_pack(struct ptlrpc_request *req, struct md_op_data *op_data); void mdc_rename_pack(struct ptlrpc_request *req, struct md_op_data *op_data, const char *old, int oldlen, const char *new, int newlen); void mdc_close_pack(struct ptlrpc_request *req, struct md_op_data *op_data); +void mdc_snapshot_pack(struct ptlrpc_request *req, struct md_op_data *op_data); /* mdc/mdc_locks.c */ int mdc_set_lock_data(struct obd_export *exp, @@ -136,6 +137,8 @@ int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data, int mdc_cancel_unused(struct obd_export *exp, const struct lu_fid *fid, ldlm_policy_data_t *policy, ldlm_mode_t mode, ldlm_cancel_flags_t flags, void *opaque); +int mdc_snapshot(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request); static inline void mdc_set_capa_size(struct ptlrpc_request *req, const struct req_msg_field *field, diff --git 
a/lustre/mdc/mdc_lib.c b/lustre/mdc/mdc_lib.c index 490204b..ce33656 100644 --- a/lustre/mdc/mdc_lib.c +++ b/lustre/mdc/mdc_lib.c @@ -45,6 +45,7 @@ #include #include #include +#include #include "mdc_internal.h" #ifndef __KERNEL__ @@ -549,3 +550,71 @@ void mdc_close_pack(struct ptlrpc_request *req, struct md_op_data *op_data) mdc_ioepoch_pack(epoch, op_data); mdc_hsm_release_pack(req, op_data); } + +/* + * mdc_snapshot_pack() + * + * pack md_op_data for snapshot + * + * \param[in] req portal rpc request + * \param[in/out] op_data operation data + */ +void mdc_snapshot_pack(struct ptlrpc_request *req, + struct md_op_data *op_data) +{ + struct mdt_rec_snapshot_create *rec; + char *tmp; + ENTRY; + + CLASSERT(sizeof(struct mdt_rec_reint) == + sizeof(struct mdt_rec_snapshot_create)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + + rec->sc_opcode = REINT_SNAPSHOT; + rec->sc_fsuid = from_kuid(&init_user_ns, current_fsuid()); + rec->sc_fsgid = from_kgid(&init_user_ns, current_fsgid()); + rec->sc_cap = op_data->op_cap; + rec->sc_file_owner = + ((__u64)(op_data->op_fsgid) | ((__u64)op_data->op_fsuid << 32)); + rec->sc_suppgid1 = op_data->op_suppgids[0]; + rec->sc_suppgid2 = op_data->op_suppgids[1]; + memcpy(&rec->sc_snapdir_fid, &op_data->op_fid1, + sizeof(op_data->op_fid1)); + memcpy(&rec->sc_snapshot_fid, &op_data->op_fid2, + sizeof(op_data->op_fid2)); + memcpy(&rec->sc_orig_fid, &op_data->op_fid3, + sizeof(op_data->op_fid3)); + rec->sc_mode = op_data->op_attr.ia_mode; + rec->sc_atime = op_data->op_attr.ia_atime.tv_sec; + rec->sc_mtime = op_data->op_attr.ia_mtime.tv_sec; + rec->sc_ctime = op_data->op_attr.ia_ctime.tv_sec; + + /* + * As snapshots permissions that are trying to create is not changed by + * default mask, umask is set in advance to 0. 
+ */ + rec->sc_umask = 0; + rec->sc_flags = 0; + /* hidden dir: MDS_SNAPSHOT bias selects MDT_SNAPSHOT_DIR_PATT */ + if (op_data->op_bias & MDS_SNAPSHOT) + rec->sc_flags |= MDT_SNAPSHOT_DIR_PATT; + + /* set snapshot name */ + mdc_pack_name(req, &RMF_NAME, op_data->op_name, op_data->op_namelen); + + /* set extended attribute name */ + if (op_data->op_eanamelen) { + mdc_pack_name(req, &RMF_SNAP_EANAME, op_data->op_eaname, + op_data->op_eanamelen); + } + if (op_data->op_data) { + /* set lov: the same data goes into RMF_EADATA and RMF_EADATA2 */ + tmp = req_capsule_client_get(&req->rq_pill, &RMF_EADATA); + memcpy(tmp, op_data->op_data, op_data->op_data_size); + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_EADATA2); + memcpy(tmp, op_data->op_data, op_data->op_data_size); + } + + EXIT; +} diff --git a/lustre/mdc/mdc_reint.c b/lustre/mdc/mdc_reint.c index c9ddfa3..41f1167 100644 --- a/lustre/mdc/mdc_reint.c +++ b/lustre/mdc/mdc_reint.c @@ -369,6 +369,9 @@ int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data, obd->u.cli.cl_default_mds_cookiesize); ptlrpc_request_set_replen(req); + if (op_data->op_bias & MDS_SNAPSHOT) + req->rq_request_portal = MDS_SNAPSHOT_PORTAL; + *request = req; rc = mdc_reint(req, obd->u.cli.cl_rpc_lock, LUSTRE_IMP_FULL); @@ -491,3 +494,118 @@ int mdc_rename(struct obd_export *exp, struct md_op_data *op_data, RETURN(rc); } + +/* + * mdc_snapshot() - send a REINT_SNAPSHOT request on MDS_SNAPSHOT_PORTAL + * + * @exp mdc obd export + * @op_data operation data + * @request portal rpc request (stored once the request has been sent) + */ +int mdc_snapshot(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct ptlrpc_request *req; + struct list_head cancels = LIST_HEAD_INIT(cancels); + int count; + int rc; + ENTRY; + + /* get original ldlm list: unused locks of op_fid3, when an original fid is given */ + count = 0; + if (op_data->op_fid3.f_seq != 0) { + count = mdc_resource_get_unused(exp, &op_data->op_fid3, + &cancels, LCK_CR, + MDS_INODELOCK_UPDATE); + } + + /* get snapshot directory ldlm list */ + count += mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + /* 
allocate request */ + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_REINT_SNAPSHOT); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); + } + + /* set request: RMF_NAME */ + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + + /* set request: RMF_SNAP_EANAME */ + req_capsule_set_size(&req->rq_pill, &RMF_SNAP_EANAME, RCL_CLIENT, + op_data->op_eanamelen + 1); + + /* set request: RMF_EADATA and RMF_EADATA2, both sized op_data_size */ + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, + op_data->op_data_size); + + req_capsule_set_size(&req->rq_pill, &RMF_EADATA2, RCL_CLIENT, + op_data->op_data_size); + + /* set request: RMF_DLM_REQ (passes the collected cancel list and its count) */ + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + spin_lock(&req->rq_lock); + req->rq_replay = req->rq_import->imp_replayable; + spin_unlock(&req->rq_lock); + + /* set request: RMF_REC_REINT */ + mdc_snapshot_pack(req, op_data); + + /* set request: RMF_MDT_MD (reply) set orig stripe size */ + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + op_data->op_data_size); + + /* set request: length of reply */ + ptlrpc_request_set_replen(req); + + /* send request */ + req->rq_request_portal = MDS_SNAPSHOT_PORTAL; + rc = mdc_reint(req, obd->u.cli.cl_rpc_lock, LUSTRE_IMP_FULL); + if (rc == -ERESTARTSYS) + rc = 0; /* -ERESTARTSYS is reported to the caller as success */ + + /* + * We save the reply LOV EA in case we have to replay a + * mdt_md_snapshot() for recovery. 
+ */ + if ((rc == 0) && req->rq_replay && S_ISREG(op_data->op_attr.ia_mode)) { + struct mdt_body *body = + req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) { + CERROR("Can't get mdt_body\n"); + GOTO(out, rc = -EPROTO); + } + if (body->mbo_valid & OBD_MD_FLEASIZE) { + void *eadata; + void *lmm; + eadata = req_capsule_server_sized_get(&req->rq_pill, + &RMF_MDT_MD, + body->mbo_eadatasize); + if (eadata == NULL) + GOTO(out, rc = -EPROTO); + lmm = req_capsule_client_get(&req->rq_pill, + &RMF_EADATA); + if (lmm == NULL || body->mbo_eadatasize > req_capsule_get_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT)) /* the reply EA must fit the RMF_EADATA request buffer it is copied into */ + GOTO(out, rc = -EPROTO); + memcpy(lmm, eadata, body->mbo_eadatasize); + } + } +out: + spin_lock(&req->rq_lock); + req->rq_replay = 0; + spin_unlock(&req->rq_lock); + + *request = req; + + RETURN(rc); +} diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c index 9a38ea3..e6dca71 100644 --- a/lustre/mdc/mdc_request.c +++ b/lustre/mdc/mdc_request.c @@ -3215,7 +3215,8 @@ struct md_ops mdc_md_ops = { .m_unpack_capa = mdc_unpack_capa, .m_get_remote_perm = mdc_get_remote_perm, .m_intent_getattr_async = mdc_intent_getattr_async, - .m_revalidate_lock = mdc_revalidate_lock + .m_revalidate_lock = mdc_revalidate_lock, + .m_snapshot = mdc_snapshot }; int __init mdc_init(void) diff --git a/lustre/mdd/mdd_device.c b/lustre/mdd/mdd_device.c index d55f358..e305efd 100644 --- a/lustre/mdd/mdd_device.c +++ b/lustre/mdd/mdd_device.c @@ -1510,6 +1510,61 @@ static int mdd_iocontrol(const struct lu_env *env, struct md_device *m, RETURN (rc); } +/* + * mdd_snapshot_get_enable() + * + * snapshot get enable handler in metadata device + * + * \param[in] env lustre environment data + * \param[in] m metadata device + * + * \retval 0 snapshot is disabled + * \retval 1 snapshot is enabled + * \retval not 0,1 error code + */ +static int mdd_snapshot_get_enable(const struct lu_env *env, + struct md_device *m) +{ + struct mdd_device *mdd = lu2mdd_dev(&m->md_lu_dev); + struct dt_device *dt = mdd->mdd_bottom; + int rc; + ENTRY; + + LASSERT(dt); + + 
/* call osd: forwarded to the bottom dt device (mdd_bottom) */ + rc = dt_snapshot_get_enable(env, dt); + + RETURN(rc); +} + +/* + * mdd_snapshot_set_enable() + * + * snapshot set enable handler in metadata device + * + * \param[in] env lustre environment data + * \param[in] m metadata device + * + * \retval 0 success + * \retval not 0 error code + */ +static int mdd_snapshot_set_enable(const struct lu_env *env, + struct md_device *m) +{ + struct mdd_device *mdd = lu2mdd_dev(&m->md_lu_dev); + struct dt_device *dt = mdd->mdd_bottom; + int rc; + ENTRY; + + LASSERT(dt); + + /* call osd: forwarded to the bottom dt device (mdd_bottom) */ + rc = dt_snapshot_set_enable(env, dt); + + RETURN(rc); +} + /* type constructor/destructor: mdd_type_init, mdd_type_fini */ LU_TYPE_INIT_FINI(mdd, &mdd_thread_key); @@ -1521,6 +1576,8 @@ static const struct md_device_operations mdd_ops = { .mdo_llog_ctxt_get = mdd_llog_ctxt_get, .mdo_iocontrol = mdd_iocontrol, .mdo_maxeasize_get = mdd_maxeasize_get, + .mdo_snapshot_get_enable = mdd_snapshot_get_enable, + .mdo_snapshot_set_enable = mdd_snapshot_set_enable, }; static struct lu_device_type_operations mdd_device_type_ops = { diff --git a/lustre/mdd/mdd_dir.c b/lustre/mdd/mdd_dir.c index 49389e0..c902471 100644 --- a/lustre/mdd/mdd_dir.c +++ b/lustre/mdd/mdd_dir.c @@ -46,7 +46,7 @@ #include #include #include - +#include #include "mdd_internal.h" static const char dot[] = "."; @@ -547,6 +547,13 @@ static int mdd_link_sanity_check(const struct lu_env *env, if (mdd_is_dead_obj(src_obj)) RETURN(-ESTALE); + /* check readonly for snapshot: -EPERM if either tattr or cattr carries a snapshot flag and tgt_snapshot(env) is not set */ + if (!tgt_snapshot(env)) { + if (LUSTRE_TEST_MDT_SNAPSHOT(tattr->la_flags) || + LUSTRE_TEST_MDT_SNAPSHOT(cattr->la_flags)) + RETURN(-EPERM); + } + /* Local ops, no lookup before link, check filename length here. 
*/ rc = mdd_name_check(m, lname); if (rc < 0) @@ -2379,7 +2386,10 @@ static int mdd_create(const struct lu_env *env, struct md_object *pobj, /* update parent directory mtime/ctime */ *la = *attr; - la->la_valid = LA_CTIME | LA_MTIME; + if (LUSTRE_TEST_MDT_SNAPSHOT_FILE(attr->la_flags)) + la->la_valid = LA_CTIME; /* snapshot file: only ctime is marked valid for the parent time update */ + else + la->la_valid = LA_CTIME | LA_MTIME; rc = mdd_update_time(env, mdd_pobj, pattr, la, handle); if (rc) GOTO(err_insert, rc); @@ -2516,6 +2526,10 @@ static int mdd_rename_sanity_check(const struct lu_env *env, if (rc) RETURN(rc); + if (!tgt_snapshot(env) && + (LUSTRE_TEST_MDT_SNAPSHOT(cattr->la_flags))) + RETURN(-EPERM); + /* XXX: when get here, "tobj == NULL" means tobj must * NOT exist (neither on remote MDS, such case has been * processed in cld_rename before mdd_rename and enable diff --git a/lustre/mdd/mdd_internal.h b/lustre/mdd/mdd_internal.h index 9183bba..c67f7e3 100644 --- a/lustre/mdd/mdd_internal.h +++ b/lustre/mdd/mdd_internal.h @@ -760,4 +760,20 @@ static inline struct obd_capa *mdo_capa_get(const struct lu_env *env, return next->do_ops->do_capa_get(env, next, old, opc); } +static inline int mdo_snapshot_clone(const struct lu_env *env, + struct mdd_object *snap_obj, + struct lu_attr *attr, + struct lu_buf *buf) +{ + struct dt_object *snap_next = mdd_object_child(snap_obj); /* child dt object whose do_lod_snapshot_clone() is called */ + if (mdd_object_exists(snap_obj) == 0) { + CERROR("%s: object "DFID" not found: rc = -2\n", + mdd_obj_dev_name(snap_obj), + PFID(mdd_object_fid(snap_obj))); + return -ENOENT; /* the value the message above prints as rc = -2 */ + } + return snap_next->do_ops->do_lod_snapshot_clone(env, + snap_next, attr, buf); +} + #endif diff --git a/lustre/mdd/mdd_object.c b/lustre/mdd/mdd_object.c index 9f0f8c2..d558e39 100644 --- a/lustre/mdd/mdd_object.c +++ b/lustre/mdd/mdd_object.c @@ -53,6 +53,7 @@ #include #include #include +#include #include "mdd_internal.h" @@ -427,6 +428,12 @@ static int mdd_fix_attr(const struct lu_env *env, struct mdd_object *obj, LASSERT(oattr != NULL); + /* check readonly for snapshot */ + if 
(!tgt_snapshot(env)) { + if (LUSTRE_TEST_MDT_SNAPSHOT_FILE(oattr->la_flags)) + RETURN(-EPERM); + } + /* export destroy does not have ->le_ses, but we may want * to drop LUSTRE_SOM_FL. */ uc = lu_ucred_check(env); @@ -893,6 +900,12 @@ static int mdd_xattr_sanity_check(const struct lu_env *env, struct lu_ucred *uc = lu_ucred_assert(env); ENTRY; + /* check readonly for snapshot: -EPERM on a snapshot object unless tgt_snapshot(env) is set */ + if (!tgt_snapshot(env)) { + if (LUSTRE_TEST_MDT_SNAPSHOT(attr->la_flags)) + RETURN(-EPERM); + } + if (mdd_is_immutable(obj) || mdd_is_append(obj)) RETURN(-EPERM); @@ -1267,6 +1280,13 @@ static int mdd_layout_swap_allowed(const struct lu_env *env, RETURN(-EBADF); } + /* check readonly for snapshot: neither side of the swap may be a snapshot object unless tgt_snapshot(env) is set */ + if (!tgt_snapshot(env)) { + if (LUSTRE_TEST_MDT_SNAPSHOT(attr1->la_flags) || + LUSTRE_TEST_MDT_SNAPSHOT(attr2->la_flags)) + RETURN(-EPERM); + } + if ((attr1->la_uid != attr2->la_uid) || (attr1->la_gid != attr2->la_gid)) RETURN(-EPERM); @@ -1578,9 +1598,10 @@ int accmode(const struct lu_env *env, const struct lu_attr *la, int flags) * NFSD uses the MDS_OPEN_OWNEROVERRIDE flag to say that a file * owner can write to a file even if it is marked readonly to hide * its brokenness. 
(bug 5781) */ - if (flags & MDS_OPEN_OWNEROVERRIDE) { - struct lu_ucred *uc = lu_ucred_check(env); + if ((flags & MDS_OPEN_OWNEROVERRIDE) && + !LUSTRE_TEST_MDT_SNAPSHOT(la->la_flags)) { + struct lu_ucred *uc = lu_ucred_check(env); if ((uc == NULL) || (la->la_uid == uc->uc_fsuid)) return 0; } @@ -2018,6 +2039,41 @@ static int mdd_object_unlock(const struct lu_env *env, return dt_object_unlock(env, mdd_object_child(mdd_obj), einfo, policy); } +/* + * mdd_snapshot_clone() - run mdo_snapshot_clone() under the object's write lock + * + * @env environment + * @snap_obj snapshot object + * @ma snapshot attributes (only ma_attr is passed down) + * @buf original or snapshot lov (see mdt_snapshot_unpack() for details) + */ +static int mdd_snapshot_clone(const struct lu_env *env, + struct md_object *snap_obj, + struct md_attr *ma, + struct lu_buf *buf) +{ + struct mdd_object *snap_mdd_obj = md2mdd_obj(snap_obj); + int rc; + ENTRY; + + /* check object: a missing object yields -ENOENT */ + if (mdd_object_exists(snap_mdd_obj) == 0) { + CERROR("%s: object "DFID" not found: rc = -2\n", + mdd_obj_dev_name(snap_mdd_obj), + PFID(mdd_object_fid(snap_mdd_obj))); + RETURN(-ENOENT); + } + + /* lock object */ + mdd_write_lock(env, snap_mdd_obj, MOR_TGT_CHILD); + /* call lod */ + rc = mdo_snapshot_clone(env, snap_mdd_obj, &ma->ma_attr, buf); + /* unlock object */ + mdd_write_unlock(env, snap_mdd_obj); + + RETURN(rc); +} + const struct md_object_operations mdd_obj_ops = { .moo_permission = mdd_permission, .moo_attr_get = mdd_attr_get, @@ -2036,4 +2092,5 @@ const struct md_object_operations mdd_obj_ops = { .moo_object_sync = mdd_object_sync, .moo_object_lock = mdd_object_lock, .moo_object_unlock = mdd_object_unlock, + .moo_snapshot_clone = mdd_snapshot_clone, }; diff --git a/lustre/mdd/mdd_permission.c b/lustre/mdd/mdd_permission.c index 2471c0c..0b7d719 100644 --- a/lustre/mdd/mdd_permission.c +++ b/lustre/mdd/mdd_permission.c @@ -48,6 +48,7 @@ #include #include #include +#include #include "mdd_internal.h" #ifdef CONFIG_FS_POSIX_ACL @@ -251,6 +252,11 @@ int __mdd_permission_internal(const struct lu_env *env, 
struct mdd_object *obj, if ((uc == NULL) || (uc->uc_valid == UCRED_INIT)) RETURN(0); + /* check readonly for snapshot: MAY_WRITE on a snapshot object is refused unless tgt_snapshot(env) is set */ + if ((mask & MAY_WRITE) && !tgt_snapshot(env)) { + if (LUSTRE_TEST_MDT_SNAPSHOT(la->la_flags)) + RETURN(-EPERM); + } /* Invalid user credit */ if (uc->uc_valid == UCRED_INVALID) RETURN(-EACCES); diff --git a/lustre/mdt/Makefile.in b/lustre/mdt/Makefile.in index 2b23a67..b343036 100644 --- a/lustre/mdt/Makefile.in +++ b/lustre/mdt/Makefile.in @@ -7,5 +7,6 @@ mdt-objs += mdt_hsm_cdt_requests.o mdt-objs += mdt_hsm_cdt_client.o mdt-objs += mdt_hsm_cdt_agent.o mdt-objs += mdt_coordinator.o +mdt-objs += mdt_snapshot.o @INCLUDE_RULES@ diff --git a/lustre/mdt/mdt_handler.c b/lustre/mdt/mdt_handler.c index 2e4e8c0..fc1116e 100644 --- a/lustre/mdt/mdt_handler.c +++ b/lustre/mdt/mdt_handler.c @@ -1867,7 +1867,8 @@ static int mdt_reint(struct tgt_session_info *tsi) [REINT_OPEN] = &RQF_MDS_REINT_OPEN, [REINT_SETXATTR] = &RQF_MDS_REINT_SETXATTR, [REINT_RMENTRY] = &RQF_MDS_REINT_UNLINK, - [REINT_MIGRATE] = &RQF_MDS_REINT_RENAME + [REINT_MIGRATE] = &RQF_MDS_REINT_RENAME, + [REINT_SNAPSHOT] = &RQF_MDS_REINT_SNAPSHOT }; ENTRY; @@ -2822,6 +2823,9 @@ void mdt_thread_info_init(struct ptlrpc_request *req, info->mti_spec.u.sp_ea.eadata = NULL; info->mti_spec.u.sp_ea.eadatalen = 0; + + info->mti_eaname.ln_name = NULL; + info->mti_eaname.ln_namelen = 0; } void mdt_thread_info_fini(struct mdt_thread_info *info) @@ -5584,6 +5588,22 @@ int mdt_get_info(struct tgt_session_info *tsi) rc = mdt_rpc_fid2path(info, key, valout, *vallen); mdt_thread_info_fini(info); + } else if (KEY_IS(KEY_SNAPSHOT_ENABLED)) { + struct mdt_thread_info *info = tsi2mdt_info(tsi); + struct md_device *next = info->mti_mdt->mdt_child; + const struct lu_env *env = info->mti_env; + __u32 enabled; + + /* call mdd */ + rc = next->md_ops->mdo_snapshot_get_enable(env, next); + mdt_thread_info_fini(info); + + /* set return value: 0 or 1 is the enable state, any other rc is returned as is */ + if (rc == 0 || rc == 1) { + enabled = (__u32)rc; + memcpy(valout, &enabled, 
min_t(size_t, *vallen, sizeof(enabled))); /* never read past the 4-byte local, whatever length the client asked for */ + rc = 0; + } } else { rc = -EINVAL; } @@ -5661,6 +5681,60 @@ static int mdt_ioc_version_get(struct mdt_thread_info *mti, void *karg) RETURN(rc); } +/* + * mdt_lctl_snapshot + * + * lctl snapshot method of IOCTL + * + * \param[in] env lustre environment + * \param[in] mdt mdt device + * \param[in] subcmd sub command + * + * subcmd=OBD_IOC_SNAPSHOT_ON + * OBD_IOC_SNAPSHOT_LOCK + * OBD_IOC_SNAPSHOT_UNLOCK + * \retval: 0 success + * \retval: not 0 error code + * + * subcmd=OBD_IOC_SNAPSHOT_STATUS + * \retval: 0 snapshot is disabled + * \retval: 1 snapshot is enabled + * \retval: not 0,1 error code + */ +static int mdt_lctl_snapshot(struct lu_env *env, + struct mdt_device *mdt, __u32 subcmd) +{ + struct md_device *next = mdt->mdt_child; + int rc; + ENTRY; + + rc = 0; + switch (subcmd) { + case OBD_IOC_SNAPSHOT_ON: + /* call mdd */ + rc = next->md_ops->mdo_snapshot_set_enable(env, next); + break; + case OBD_IOC_SNAPSHOT_STATUS: + /* call mdd */ + rc = next->md_ops->mdo_snapshot_get_enable(env, next); + break; + case OBD_IOC_SNAPSHOT_LOCK: + /* test and lock: -EBUSY when MDT_FL_SNAPSHOT_ENABLING is already set */ + if (test_and_set_bit(MDT_FL_SNAPSHOT_ENABLING, + &mdt->mdt_snapshot_flags)) + rc = -EBUSY; + break; + case OBD_IOC_SNAPSHOT_UNLOCK: + /* unlock */ + clear_bit(MDT_FL_SNAPSHOT_ENABLING, &mdt->mdt_snapshot_flags); + break; + default: + rc = -EOPNOTSUPP; + } + + RETURN(rc); +} + /* ioctls on obd dev */ static int mdt_iocontrol(unsigned int cmd, struct obd_export *exp, int len, void *karg, void *uarg) @@ -5745,6 +5819,12 @@ static int mdt_iocontrol(unsigned int cmd, struct obd_export *exp, int len, &mti->mti_tmp_fid1); break; } + case OBD_IOC_SNAPSHOT: { + struct obd_ioctl_data *data = karg; + + rc = mdt_lctl_snapshot(&env, mdt, data->ioc_command); + break; + } default: rc = -EOPNOTSUPP; CERROR("%s: Not supported cmd = %d, rc = %d\n", diff --git a/lustre/mdt/mdt_idmap.c b/lustre/mdt/mdt_idmap.c index f49c6dd..31cba12 100644 --- a/lustre/mdt/mdt_idmap.c +++ b/lustre/mdt/mdt_idmap.c @@ -322,7 +322,7 @@ int 
mdt_fix_attr_ucred(struct mdt_thread_info *info, __u32 op) if (uc == NULL) return -EINVAL; - if (op != REINT_SETATTR) { + if ((op != REINT_SETATTR) && (op != REINT_SNAPSHOT)) { if ((attr->la_valid & LA_UID) && (attr->la_uid != -1)) attr->la_uid = uc->uc_fsuid; /* for S_ISGID, inherit gid from his parent, such work will be diff --git a/lustre/mdt/mdt_internal.h b/lustre/mdt/mdt_internal.h index 9017957..6ee48bf 100644 --- a/lustre/mdt/mdt_internal.h +++ b/lustre/mdt/mdt_internal.h @@ -157,6 +157,9 @@ struct coordinator { #define MDT_FL_CFGLOG 0 #define MDT_FL_SYNCED 1 +/* snapshot state flag bits (bit numbers within mdt_snapshot_flags) */ +#define MDT_FL_SNAPSHOT_ENABLING 0 + struct mdt_device { /* super-class */ struct lu_device mdt_lu_dev; @@ -230,6 +233,9 @@ struct mdt_device { struct lu_device *mdt_qmt_dev; struct coordinator mdt_coordinator; + + /* snapshot state bits, see MDT_FL_SNAPSHOT_ENABLING */ + unsigned long mdt_snapshot_flags; }; #define MDT_SERVICE_WATCHDOG_FACTOR (2) @@ -455,6 +461,9 @@ struct mdt_thread_info { /* should be enough to fit lustre_mdt_attrs */ char mti_xattr_buf[128]; struct ldlm_enqueue_info mti_einfo; + + /* for snapshot in request: name unpacked from RMF_SNAP_EANAME */ + struct lu_name mti_eaname; }; extern struct lu_context_key mdt_thread_key; @@ -1082,5 +1091,19 @@ static inline char *mdt_obd_name(struct mdt_device *mdt) int mds_mod_init(void); void mds_mod_exit(void); +/* mdt/mdt_snapshot.c: indices of the per-operation snapshot counters */ +enum { + SNAPSHOT_COUNT_MKDIR = 0, + SNAPSHOT_COUNT_MKNOD, + SNAPSHOT_COUNT_RMDIR, + SNAPSHOT_COUNT_UNLINK, + SNAPSHOT_COUNT_SETXATTR, + SNAPSHOT_COUNT_LAST, +}; + +void mdt_snapshot_counter_init(void); +inline void mdt_snapshot_counter_incr(int op); /* NOTE(review): 'inline' on a prototype with no definition in this header - confirm it links under the compiler's inline semantics */ +inline __u64 mdt_snapshot_counter_get(int op); /* NOTE(review): same concern as the prototype above */ + #endif /* __KERNEL__ */ #endif /* _MDT_H */ diff --git a/lustre/mdt/mdt_lib.c b/lustre/mdt/mdt_lib.c index 65ceaf3..2e436b2 100644 --- a/lustre/mdt/mdt_lib.c +++ b/lustre/mdt/mdt_lib.c @@ -1203,6 +1203,11 @@ static int mdt_unlink_unpack(struct mdt_thread_info *info) else ma->ma_attr_flags &= ~MDS_VTX_BYPASS; + if (rec->ul_bias & MDS_SNAPSHOT) + 
ma->ma_attr_flags |= MDS_SNAPSHOT; + else + ma->ma_attr_flags &= ~MDS_SNAPSHOT; + info->mti_spec.no_create = !!req_is_replay(mdt_info_req(info)); rc = mdt_dlmreq_unpack(info); @@ -1279,12 +1284,10 @@ static int mdt_rename_unpack(struct mdt_thread_info *info) /* * please see comment above LOV_MAGIC_V1_DEF */ -static void mdt_fix_lov_magic(struct mdt_thread_info *info) +static void mdt_fix_lov_magic(struct mdt_thread_info *info, void *lmm) { - struct mdt_reint_record *rr = &info->mti_rr; - struct lov_user_md_v1 *v1; + struct lov_user_md_v1 *v1 = lmm; - v1 = (void *)rr->rr_eadata; LASSERT(v1); if (unlikely(req_is_replay(mdt_info_req(info)))) { @@ -1374,7 +1377,7 @@ static int mdt_open_unpack(struct mdt_thread_info *info) sp->u.sp_ea.eadatalen = rr->rr_eadatalen; sp->u.sp_ea.eadata = rr->rr_eadata; sp->no_create = !!req_is_replay(req); - mdt_fix_lov_magic(info); + mdt_fix_lov_magic(info, (void *)rr->rr_eadata); } /* @@ -1454,6 +1457,87 @@ static int mdt_setxattr_unpack(struct mdt_thread_info *info) RETURN(0); } +/* + * mdt_snapshot_unpack() - unpack a REINT_SNAPSHOT request into info + * + * \param[in] info thread information + * + * \retval 0 success + * \retval not 0 error code + */ +static int mdt_snapshot_unpack(struct mdt_thread_info *info) +{ + struct lu_ucred *uc = mdt_ucred(info); + struct mdt_reint_record *rr = &info->mti_rr; + struct md_attr *ma = &info->mti_attr; + struct req_capsule *pill = info->mti_pill; + struct lu_attr *attr = &ma->ma_attr; + struct lu_fid *tmp_fid = &info->mti_tmp_fid1; + struct mdt_rec_snapshot_create *rec; + int rc; + ENTRY; + + CLASSERT(sizeof(struct mdt_rec_reint) == + sizeof(struct mdt_rec_snapshot_create)); + rec = req_capsule_client_get(pill, &RMF_REC_REINT); + if (rec == NULL) { + CERROR("snapshot no request data\n"); + RETURN(-EFAULT); + } + + /* set user cred */ + uc->uc_fsuid = rec->sc_fsuid; + uc->uc_fsgid = rec->sc_fsgid; + uc->uc_cap = rec->sc_cap; + uc->uc_umask = rec->sc_umask; + uc->uc_suppgids[0] = rec->sc_suppgid1; + uc->uc_suppgids[1] = -1; /* sc_suppgid2 of the record is not used */ + + /* set reint 
record */ + rr->rr_fid1 = &rec->sc_snapdir_fid; + rr->rr_fid2 = &rec->sc_snapshot_fid; + mdt_name_unpack(pill, &RMF_NAME, &rr->rr_name, MNF_FIX_ANON); /* NOTE(review): result ignored - confirm an invalid snapshot name is handled by the callers */ + mdt_name_unpack(pill, &RMF_SNAP_EANAME, &info->mti_eaname, 0); /* result ignored here as well */ + + /* set orig lov */ + rr->rr_eadata = req_capsule_client_get(pill, &RMF_EADATA2); + rr->rr_eadatalen = req_capsule_get_size(pill, &RMF_EADATA2, RCL_CLIENT); + + + /* set attributes */ + attr->la_mode = rec->sc_mode; + attr->la_atime = rec->sc_atime; + attr->la_mtime = rec->sc_mtime; + attr->la_ctime = rec->sc_ctime; + attr->la_flags = rec->sc_flags; + attr->la_uid = (__u32)(rec->sc_file_owner >> 32); /* uid: high 32 bits of sc_file_owner */ + attr->la_gid = (__u32)(rec->sc_file_owner & 0xFFFFFFFFUll); /* gid: low 32 bits of sc_file_owner */ + attr->la_valid = LA_MODE | LA_UID | LA_GID | + LA_CTIME | LA_MTIME | LA_ATIME; + + /* set spec */ + memset(&info->mti_spec.u, 0, sizeof(info->mti_spec.u)); + info->mti_spec.sp_cr_flags = 0; + + /* orig mdt_md_snapshot() request: orig lov + * replay mdt_md_snapshot() request: snapshot lov + * orig and replay mdt_md_snapshot_xattr() request: orig xattr + */ + ma->ma_lmm = req_capsule_client_get(pill, &RMF_EADATA); + ma->ma_lmm_size = req_capsule_get_size(pill, &RMF_EADATA, RCL_CLIENT); + if (ma->ma_lmm_size > 0 && S_ISREG(attr->la_mode)) + mdt_fix_lov_magic(info, ma->ma_lmm); + + /* set original fid */ + *tmp_fid = rec->sc_orig_fid; + + /* set ldlm request: its result is the return value of this function */ + rc = mdt_dlmreq_unpack(info); + if (rc) + CERROR("snapshot ldlm request error rc=%d\n", rc); + + RETURN(rc); +} typedef int (*reint_unpacker)(struct mdt_thread_info *info); @@ -1467,6 +1551,7 @@ static reint_unpacker mdt_reint_unpackers[REINT_MAX] = { [REINT_SETXATTR] = mdt_setxattr_unpack, [REINT_RMENTRY] = mdt_rmentry_unpack, [REINT_MIGRATE] = mdt_rename_unpack, + [REINT_SNAPSHOT] = mdt_snapshot_unpack, }; int mdt_reint_unpack(struct mdt_thread_info *info, __u32 op) diff --git a/lustre/mdt/mdt_lproc.c b/lustre/mdt/mdt_lproc.c index ff7a260..80b327a 100644 --- a/lustre/mdt/mdt_lproc.c +++ b/lustre/mdt/mdt_lproc.c @@ -826,6 +826,34 @@ 
LPROC_SEQ_FOPS_RW_TYPE(mdt, ir_factor); LPROC_SEQ_FOPS_RW_TYPE(mdt, nid_stats_clear); LPROC_SEQ_FOPS(mdt_hsm_cdt_control); +/* + * mdt_snapshot_stats_seq_show() + * + * /proc/fs/mdt/-MDT0000/snapshot_stats show process: one line per snapshot counter + * + * \param[in] m seq_file the counters are printed to + * \param[in] data callback data (unused) + * + * \retval the value returned by seq_printf() + * \retval -1 error + */ +static int mdt_snapshot_stats_seq_show(struct seq_file *m, void *data) +{ + return seq_printf(m, + "%-25s %llu samples [reqs]\n" + "%-25s %llu samples [reqs]\n" + "%-25s %llu samples [reqs]\n" + "%-25s %llu samples [reqs]\n" + "%-25s %llu samples [reqs]\n", + "mknod", mdt_snapshot_counter_get(SNAPSHOT_COUNT_MKNOD), + "unlink", mdt_snapshot_counter_get(SNAPSHOT_COUNT_UNLINK), + "mkdir", mdt_snapshot_counter_get(SNAPSHOT_COUNT_MKDIR), + "rmdir", mdt_snapshot_counter_get(SNAPSHOT_COUNT_RMDIR), + "setxattr", mdt_snapshot_counter_get(SNAPSHOT_COUNT_SETXATTR)); +} + +LPROC_SEQ_FOPS_RO(mdt_snapshot_stats); + static struct lprocfs_seq_vars lprocfs_mdt_obd_vars[] = { { .name = "uuid", .fops = &mdt_uuid_fops }, @@ -879,6 +907,8 @@ static struct lprocfs_seq_vars lprocfs_mdt_obd_vars[] = { .fops = &mdt_enable_remote_dir_gid_fops }, { .name = "hsm_control", .fops = &mdt_hsm_cdt_control_fops }, + { .name = "snapshot_stats", + .fops = &mdt_snapshot_stats_fops }, { 0 } }; @@ -974,6 +1004,9 @@ int mdt_procfs_init(struct mdt_device *mdt, const char *name) LASSERT(name != NULL); + /* initialize the snapshot counters */ + mdt_snapshot_counter_init(); + obd->obd_vars = lprocfs_mdt_obd_vars; rc = lprocfs_seq_obd_setup(obd); if (rc) { diff --git a/lustre/mdt/mdt_mds.c b/lustre/mdt/mdt_mds.c index 424ddf5..ceaab97 100644 --- a/lustre/mdt/mdt_mds.c +++ b/lustre/mdt/mdt_mds.c @@ -64,6 +64,7 @@ struct mds_device { struct ptlrpc_service *mds_mdsc_service; struct ptlrpc_service *mds_mdss_service; struct ptlrpc_service *mds_fld_service; + struct ptlrpc_service *mds_snapshot_service; }; /* @@ -99,6 +100,14 @@ static 
char *mds_attr_num_cpts; CFS_MODULE_PARM(mds_attr_num_cpts, "c", charp, 0444, "CPU partitions MDS setattr threads should run on"); +static unsigned long mds_snapshot_num_threads; /* used as tc_nthrs_user of the snapshot service */ +CFS_MODULE_PARM(mds_snapshot_num_threads, "ul", ulong, 0444, + "number of MDS snapshot service threads to start"); + +static char *mds_snapshot_num_cpts; /* used as cc_pattern of the snapshot service */ +CFS_MODULE_PARM(mds_snapshot_num_cpts, "c", charp, 0444, + "CPU partitions MDS snapshot threads should run on"); + /* device init/fini methods */ static void mds_stop_ptlrpc_service(struct mds_device *m) { @@ -131,6 +140,10 @@ static void mds_stop_ptlrpc_service(struct mds_device *m) ptlrpc_unregister_service(m->mds_fld_service); m->mds_fld_service = NULL; } + if (m->mds_snapshot_service != NULL) { + ptlrpc_unregister_service(m->mds_snapshot_service); + m->mds_snapshot_service = NULL; + } EXIT; } @@ -428,6 +441,51 @@ static int mds_start_ptlrpc_service(struct mds_device *m) GOTO(err_mds_svc, rc); } + /* snapshot service start: requests arrive on MDS_SNAPSHOT_PORTAL */ + memset(&conf, 0, sizeof(conf)); + conf = (typeof(conf)) { + .psc_name = LUSTRE_MDT_NAME "_snapshot", + .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR, + .psc_buf = { + .bc_nbufs = MDS_NBUFS, + .bc_buf_size = MDS_REG_BUFSIZE, + .bc_req_max_size = MDS_REG_MAXREQSIZE, + .bc_rep_max_size = MDS_REG_MAXREPSIZE, + .bc_req_portal = MDS_SNAPSHOT_PORTAL, + .bc_rep_portal = MDC_REPLY_PORTAL, + }, + /* + * We'd like to have a mechanism to set this on a per-device + * basis, but alas... 
+ */ + .psc_thr = { + .tc_thr_name = LUSTRE_MDT_NAME "_snap", + .tc_thr_factor = MDS_SNAPSHOT_THR_FACTOR, + .tc_nthrs_init = MDS_SNAPSHOT_NTHRS_INIT, + .tc_nthrs_base = MDS_SNAPSHOT_NTHRS_BASE, + .tc_nthrs_max = MDS_SNAPSHOT_NTHRS_MAX, + .tc_nthrs_user = mds_snapshot_num_threads, + .tc_cpu_affinity = 1, + .tc_ctx_tags = LCT_MD_THREAD, + }, + .psc_cpt = { + .cc_pattern = mds_snapshot_num_cpts, + }, + .psc_ops = { + .so_req_handler = tgt_request_handle, + .so_req_printer = target_print_req, + .so_hpreq_handler = ptlrpc_hpreq_handler, + }, + }; + m->mds_snapshot_service = ptlrpc_register_service(&conf, procfs_entry); + if (IS_ERR(m->mds_snapshot_service)) { + rc = PTR_ERR(m->mds_snapshot_service); + CERROR("failed to start snapshot service: %d\n", rc); + m->mds_snapshot_service = NULL; /* mds_stop_ptlrpc_service() only unregisters a non-NULL service */ + + GOTO(err_mds_svc, rc); + } + EXIT; err_mds_svc: if (rc) diff --git a/lustre/mdt/mdt_recovery.c b/lustre/mdt/mdt_recovery.c index f5a49da..9de1661 100644 --- a/lustre/mdt/mdt_recovery.c +++ b/lustre/mdt/mdt_recovery.c @@ -367,7 +367,8 @@ static mdt_reconstructor reconstructors[REINT_MAX] = { [REINT_UNLINK] = mdt_reconstruct_generic, [REINT_RENAME] = mdt_reconstruct_generic, [REINT_OPEN] = mdt_reconstruct_open, - [REINT_SETXATTR] = mdt_reconstruct_generic + [REINT_SETXATTR] = mdt_reconstruct_generic, + [REINT_SNAPSHOT] = mdt_reconstruct_create /* snapshot replies are reconstructed by the create reconstructor */ }; void mdt_reconstruct(struct mdt_thread_info *mti, diff --git a/lustre/mdt/mdt_reint.c b/lustre/mdt/mdt_reint.c index 2f98026..b909647 100644 --- a/lustre/mdt/mdt_reint.c +++ b/lustre/mdt/mdt_reint.c @@ -48,6 +48,7 @@ #include "mdt_internal.h" #include +#include static inline void mdt_reint_init_ma(struct mdt_thread_info *info, struct md_attr *ma) @@ -867,6 +868,7 @@ static int mdt_reint_unlink(struct mdt_thread_info *info, struct mdt_object *s0_obj = NULL; int rc; int no_name = 0; + int snapshot_flag = 0; /* 1 once tgt_snapshot_set() has been called for this request */ ENTRY; DEBUG_REQ(D_INODE, req, "unlink "DFID"/"DNAME"", PFID(rr->rr_fid1), @@ -881,6 +883,12 @@ static int mdt_reint_unlink(struct 
mdt_thread_info *info, if (!fid_is_md_operative(rr->rr_fid1)) RETURN(-EPERM); + /* set snapshot readonly flag: only for requests carrying MDS_SNAPSHOT, cleared again at the out label */ + if (ma->ma_attr_flags & MDS_SNAPSHOT) { + tgt_snapshot_set(info->mti_env); + snapshot_flag = 1; + } + /* * step 1: Found the parent. */ @@ -1042,6 +1050,8 @@ static int mdt_reint_unlink(struct mdt_thread_info *info, switch (ma->ma_attr.la_mode & S_IFMT) { case S_IFDIR: mdt_counter_incr(req, LPROC_MDT_RMDIR); + if (snapshot_flag == 1) + mdt_snapshot_counter_incr(SNAPSHOT_COUNT_RMDIR); break; case S_IFREG: case S_IFLNK: @@ -1050,6 +1060,9 @@ static int mdt_reint_unlink(struct mdt_thread_info *info, case S_IFIFO: case S_IFSOCK: mdt_counter_incr(req, LPROC_MDT_UNLINK); + if (snapshot_flag == 1) + mdt_snapshot_counter_incr( + SNAPSHOT_COUNT_UNLINK); break; default: LASSERTF(0, "bad file type %o unlinking\n", @@ -1069,6 +1082,10 @@ unlock_parent: put_parent: mdt_object_put(info->mti_env, mp); out: + /* clear snapshot readonly flag */ + if (snapshot_flag == 1) + tgt_snapshot_clear(info->mti_env); + return rc; } @@ -2008,6 +2025,510 @@ static int mdt_reint_migrate(struct mdt_thread_info *info, return mdt_reint_rename_or_migrate(info, lhc, MRL_MIGRATE); } +/* + * mdt_md_snapshot() + * + * create snapshot: insert the snapshot object under the snapshot directory and, for regular files, clone the data via mo_snapshot_clone() + * + * \param[in] info reint information + * + * \retval 0 success + * \retval not 0 error code + */ +static int mdt_md_snapshot(struct mdt_thread_info *info) +{ + struct ptlrpc_request *req = mdt_info_req(info); + struct mdt_device *mdt = info->mti_mdt; + struct mdt_body *repbody; + struct mdt_object *snap_dir, *snap; + struct mdt_reint_record *rr = &info->mti_rr; + struct mdt_lock_handle *snap_dir_lh; + struct lu_fid *snap_dir_fid = (struct lu_fid *)rr->rr_fid1; + struct lu_fid *snap_fid = (struct lu_fid *)rr->rr_fid2; + struct lu_fid *orig_fid = &info->mti_tmp_fid1; + struct lu_buf *buf = &info->mti_buf; + struct lu_name *snap_name = &rr->rr_name; + struct md_attr *ma = &info->mti_attr; + int rc, rc2; + ENTRY; + + /* MEMO: + * In info, follows are set by 
mdt_snapshot_unpack(). + * (1) mdt_ucred(info) + * uc->uc_fsuid uid + * uc->uc_fsgid gid + * uc->uc_cap cap + * uc->uc_umask umask + * (2) info->mti_rr + * rr->rr_fid1 snapshot directory fid + * rr->rr_fid2 snapshot fid + * rr->rr_name snapshot name + * (3) info->mti_attr.ma_attr + * attr->la_mode mode + * attr->la_uid uid + * attr->la_gid gid + * attr->la_atime atime + * attr->la_mtime mtime + * attr->la_ctime ctime + * attr->la_valid LA_MODE | LA_UID | LA_GID | + * LA_CTIME | LA_MTIME | LA_ATIME + * (4) info->mti_spec + * (5) info->mti_tmp_fid1 original fid + * (6) info->mti_dlm_req ldlm cancel request + */ + + DEBUG_REQ(D_INODE, req, + "START TO CREATE SNAPSHOT ("DNAME"->"DFID") " + "in "DFID" orig "DFID"", + PNAME(snap_name), PFID(snap_fid), + PFID(snap_dir_fid), PFID(orig_fid)); + + /* check snapshot parent directory FID */ + if (!fid_is_md_operative(snap_dir_fid)) { + CERROR("check error snapshot parent directory FID "DFID"\n", + PFID(snap_dir_fid)); + GOTO(out, rc = -EPERM); + } + /* check snapshot orig FID: must be md-operative or have a "dot" sequence */ + if (!fid_is_md_operative(orig_fid) && + !fid_seq_is_dot(orig_fid->f_seq)) { + CERROR("check error snapshot orig directory FID "DFID"\n", + PFID(orig_fid)); + GOTO(out, rc = -EPERM); + } + + /* get reply body (NOTE(review): not checked for NULL before later use) */ + repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY); + + /* search snapshot parent directory */ + snap_dir = mdt_object_find(info->mti_env, info->mti_mdt, snap_dir_fid); + if (IS_ERR(snap_dir)) { + CERROR("search error snapshot parent directory\n"); + GOTO(out, rc = PTR_ERR(snap_dir)); + } + + /* test snapshot parent directory existence */ + if (!mdt_object_exists(snap_dir)) { + CERROR("test error snapshot parent directory existance\n"); + GOTO(out_free_snap_dir, rc = -ENOENT); + } + + /* lock snapshot parent directory */ + snap_dir_lh = &info->mti_lh[MDT_LH_PARENT]; + mdt_lock_pdo_init(snap_dir_lh, LCK_PW, snap_name); + rc = mdt_object_lock(info, snap_dir, snap_dir_lh, MDS_INODELOCK_UPDATE, + MDT_CROSS_LOCK); + if (rc) { + 
CERROR("lock error snapshot parent directory rc=%d\n", rc); + GOTO(out_free_snap_dir, rc); + } + + if (!mdt_object_remote(snap_dir)) { + rc = mdt_version_get_check_save(info, snap_dir, 0); + if (rc) { + CERROR("version check error snapshot rc=%d\n", + rc); + GOTO(out_unlock_snap_dir, rc); + } + } + + fid_zero(&info->mti_tmp_fid2); + rc = mdo_lookup(info->mti_env, mdt_object_child(snap_dir), + snap_name, &info->mti_tmp_fid2, + &info->mti_spec); + + if (rc == 0 && !req_is_replay(req)) + GOTO(out_unlock_snap_dir, rc = -EEXIST); /* an existing name is only tolerated on replay */ + + if (rc != 0 && rc != -ENOENT) { + CERROR("lookup error snapshot rc=%d\n", + rc); + GOTO(out_unlock_snap_dir, rc); + } + + ma->ma_attr.la_flags |= MDT_SNAPSHOT_FILE_PATT; + /* name not found: create the snapshot object; otherwise (replay) reuse the existing one */ + if (rc == -ENOENT) { + /* save version of file name for replay, + it must be ENOENT here */ + mdt_enoent_version_save(info, 1); + + /* new object */ + snap = mdt_object_new(info->mti_env, mdt, snap_fid); + if (IS_ERR(snap)) { + rc = PTR_ERR(snap); + mdt_create_pack_capa(info, rc, NULL, repbody); + CERROR("does not create new object rc=%d\n" + , rc); + GOTO(out_unlock_snap_dir, rc = PTR_ERR(snap)); + } + + rc = mdt_remote_permission(info, snap_dir, snap); + if (rc != 0) { + CERROR("remote permission error rc=%d\n", rc); + GOTO(out_free_snap, rc); + } + + /* capa for cross-ref will be stored here */ + ma->ma_capa = req_capsule_server_get(info->mti_pill, + &RMF_CAPA1); + LASSERT(ma->ma_capa); + + /* Version of child will be updated on disk. */ + tgt_vbr_obj_set(info->mti_env, mdt_obj2dt(snap)); + rc = mdt_version_get_check_save(info, snap, 2); + if (rc) { + CERROR("version of child check error rc=%d\n", rc); + GOTO(out_free_snap, rc); + } + + /* Let lower layer know current lock mode. */ + info->mti_spec.sp_cr_mode = + mdt_dlm_mode2mdl_mode(snap_dir_lh->mlh_pdo_mode); + + /* + * Do not perform lookup sanity check. We know that name does + * not exist. 
+ */ + info->mti_spec.sp_cr_lookup = 0; + info->mti_spec.sp_feat = &dt_directory_features; + + /* create snapshot object */ + ma->ma_attr.la_valid |= LA_FLAGS; + ma->ma_valid = 0; + rc = mdo_create(info->mti_env, + mdt_object_child(snap_dir), + snap_name, + mdt_object_child(snap), + &info->mti_spec, ma); + if (rc) { + CERROR("create error snapshot object rc=%d\n", rc); + GOTO(out_free_snap, rc); + } + ma->ma_need = MA_INODE; + rc = mdt_attr_get_complex(info, snap, ma); + if (rc) { + CERROR("get inode attr error snapshot object rc=%d\n", + rc); + GOTO(out_free_snap, rc); + } + } else { + snap = mdt_object_find(info->mti_env, mdt, snap_fid); + if (IS_ERR(snap)) { + rc = PTR_ERR(snap); + CERROR("find error snapshot object rc=%d\n", rc); + GOTO(out_unlock_snap_dir, rc); + } + ma->ma_need = MA_INODE; + rc = mo_attr_get(info->mti_env, + mdt_object_child(snap), ma); + if (rc) { + CERROR("get attribute error snapshot" + " object rc=%d\n", rc); + GOTO(out_delete_snap, rc); + } + if (!(ma->ma_attr.la_flags & LUSTRE_SNAPSHOT_FL)) { + CERROR("invalid snapshot inode flags\n"); + rc = -EFAULT; + GOTO(out_delete_snap, rc); + } + ma->ma_valid = 0; + rc = mdt_stripe_get(info, snap, + ma, XATTR_NAME_LOV); + if (rc == 0) + goto done_set_xattr; + ma->ma_valid |= MA_INODE; + } + + /* return fid & attr to client */ + mdt_pack_attr2body(info, repbody, &ma->ma_attr, + mdt_object_fid(snap)); + + /* set lov attribute */ + if (ma->ma_lmm_size > 0) { + buf->lb_buf = ma->ma_lmm; + buf->lb_len = ma->ma_lmm_size; + rc = mo_xattr_set(info->mti_env, mdt_object_child(snap), + buf, XATTR_NAME_LOV, LU_XATTR_SNAPSHOT); + if (rc < 0) { + CERROR("set lov attribute error rc=%d\n", rc); + GOTO(out_delete_snap, rc); + } + } +done_set_xattr: + /* file snapshot */ + if (S_ISREG(ma->ma_attr.la_mode)) { + buf->lb_buf = (void *)rr->rr_eadata; + buf->lb_len = rr->rr_eadatalen; + LASSERT(buf->lb_len > 0); + /* call ost for snapshot clone on regular file */ + rc = mo_snapshot_clone(info->mti_env, + 
mdt_object_child(snap), + ma, + buf); + if (rc) { + CERROR("snapshot clone error rc=%d\n", rc); + GOTO(out_delete_snap, rc); + } + } + + mdt_create_pack_capa(info, rc, snap, repbody); + if (S_ISREG(ma->ma_attr.la_mode)) { + ma->ma_lmm = req_capsule_server_get(info->mti_pill, + &RMF_MDT_MD); + ma->ma_lmm_size = req_capsule_get_size(info->mti_pill, + &RMF_MDT_MD, + RCL_SERVER); + if (ma->ma_lmm_size < 0) + GOTO(out_delete_snap, rc = -EFAULT); + + ma->ma_valid = 0; + ma->ma_need = MA_LOV; + rc = mdt_attr_get_complex(info, snap, ma); + if (rc) { + CERROR("get lov error rc=%d\n", + rc); + GOTO(out_delete_snap, rc); + } + LASSERT(ma->ma_valid & MA_LOV); + repbody->mbo_eadatasize = ma->ma_lmm_size; + repbody->mbo_valid |= OBD_MD_FLEASIZE; + } + + /* free snapshot object */ + mdt_object_put(info->mti_env, snap); + + /* unlock & free snapshot directory */ + mdt_object_unlock_put(info, snap_dir, snap_dir_lh, 0); + + RETURN(0); + +out_delete_snap: + ma->ma_need = 0; + ma->ma_valid = 0; + rc2 = mdo_unlink(info->mti_env, + mdt_object_child(snap_dir), + mdt_object_child(snap), + snap_name, ma, 0); + if (rc2 != 0) + CERROR("failed to cleanup of create snapshot: " + "rc = %d\n", rc2); + +out_free_snap: + /* free snapshot object */ + mdt_create_pack_capa(info, rc, snap, repbody); + mdt_object_put(info->mti_env, snap); +out_unlock_snap_dir: + /* unlock snap parent directory */ + mdt_object_unlock(info, snap_dir, snap_dir_lh, rc); +out_free_snap_dir: + /* free snap parent directory */ + mdt_object_put(info->mti_env, snap_dir); + +out: + if (rc != -EEXIST) + DEBUG_REQ(D_ERROR, req, + "failed to create snapshot rc=%d " + "("DNAME"->"DFID") " + "in "DFID" orig "DFID"", + rc, PNAME(snap_name), PFID(snap_fid), + PFID(snap_dir_fid), PFID(orig_fid)); + RETURN(rc); +} + +/* + * mdt_md_snapshot_xattr() + * + * set extented atribute for snapshot create + * + * \param[in] info reint information + * + * \retval 0 success + * \retval not 0 error code + */ +static int mdt_md_snapshot_xattr(struct 
mdt_thread_info *info) +{ + struct mdt_object *snap; + struct mdt_reint_record *rr = &info->mti_rr; + struct mdt_lock_handle *snap_lh; + struct lu_fid *snap_dir_fid = (struct lu_fid *)rr->rr_fid1; + struct lu_fid *snap_fid = (struct lu_fid *)rr->rr_fid2; + struct lu_buf *buf = &info->mti_buf; + struct lu_name *snap_name = &rr->rr_name; + struct md_attr *ma = &info->mti_attr; + __u64 lockpart; + int rc; + ENTRY; + + DEBUG_REQ(D_INODE, mdt_info_req(info), + "START TO SET XATTR SNAPSHOT ("DNAME"->"DFID") " + "in "DFID"", + PNAME(snap_name), PFID(snap_fid), + PFID(snap_dir_fid)); + + /* check snapshot parent directory FID */ + if (!fid_is_md_operative(snap_fid)) { + CERROR("check error snapshot parent directory FID\n"); + RETURN(-EPERM); + } + + rc = mdt_init_ucred_reint(info); + if (rc != 0) + RETURN(rc); + + lockpart = MDS_INODELOCK_UPDATE; + /* Revoke all clients' lookup lock, since the access + * permissions for this inode is changed when ACL_ACCESS is + * set. This isn't needed for ACL_DEFAULT, since that does + * not change the access permissions of this inode, nor any + * other existing inodes. It is setting the ACLs inherited + * by new directories/files at create time. */ + /* We need revoke both LOOKUP|PERM lock here, see mdt_attr_set. */ + if (!strcmp(info->mti_eaname.ln_name, XATTR_NAME_ACL_ACCESS)) + lockpart |= MDS_INODELOCK_PERM | MDS_INODELOCK_LOOKUP; + /* We need to take the lock on behalf of old clients so that newer + * clients flush their xattr caches */ + else + lockpart |= MDS_INODELOCK_XATTR; + + snap_lh = &info->mti_lh[MDT_LH_PARENT]; + /* ACLs were sent to clients under LCK_CR locks, so taking LCK_EX + * to cancel them. 
*/ + mdt_lock_reg_init(snap_lh, LCK_EX); + snap = mdt_object_find_lock(info, snap_fid, snap_lh, lockpart); + if (IS_ERR(snap)) { + CERROR("lock error snapshot rc=%d\n", (int)PTR_ERR(snap)); + GOTO(out, rc = PTR_ERR(snap)); + } + if (!mdt_object_exists(snap)) { + CERROR("snapshot object not exist\n"); + GOTO(out_unlock_snap, rc = -ENOENT); + } + tgt_vbr_obj_set(info->mti_env, mdt_obj2dt(snap)); + rc = mdt_version_get_check_save(info, snap, 0); + if (rc) { + CERROR("version check error snapshot rc=%d\n", rc); + GOTO(out_unlock_snap, rc); + } + + /* set external attribute */ + if (info->mti_eaname.ln_namelen && ma->ma_lmm_size > 0) { + buf->lb_buf = ma->ma_lmm; + buf->lb_len = ma->ma_lmm_size; + rc = mo_xattr_set(info->mti_env, mdt_object_child(snap), + buf, info->mti_eaname.ln_name, 0); + if (rc < 0) { + CERROR("set external attribute error rc=%d\n", rc); + GOTO(out_unlock_snap, rc); + } + } +out_unlock_snap: + /* unlock and release the snapshot object */ + mdt_object_unlock_put(info, snap, snap_lh, rc); + + if (rc) + DEBUG_REQ(D_ERROR, mdt_info_req(info), + "failed to setxattr snapshot rc=%d " + "("DNAME"->"DFID") " + "in "DFID"", + rc, PNAME(snap_name), PFID(snap_fid), + PFID(snap_dir_fid)); +out: + mdt_exit_ucred(info); + RETURN(rc); +} + +/* + * mdt_reint_snapshot() + * + * call REINT_SNAPSHOT + * + * \param[in] info reint information + * \param[in] lhc lock handle + * + * \retval 0 success + * \retval not 0 error code + */ +int mdt_reint_snapshot(struct mdt_thread_info *info, + struct mdt_lock_handle *lhc) +{ + struct ptlrpc_request *req = mdt_info_req(info); + int ope, snap_ope; + int rc; + ENTRY; + + CDEBUG(D_TRACE, "START TO CREATE SNAPSHOT\n"); + + /* set readonly flag */ + tgt_snapshot_set(info->mti_env); + + /* cancel ldlm lock */ + if (info->mti_dlm_req) + ldlm_request_cancel(req, info->mti_dlm_req, 0); + + /* check protocol */ + if (!lu_name_is_valid(&info->mti_rr.rr_name)) { + CERROR("protocol error\n"); + GOTO(out, rc = -EPROTO); + } + + /* check if readonly */ + if (exp_connect_flags(req->rq_export) & 
OBD_CONNECT_RDONLY) { + CERROR("readonly filesystem\n"); + GOTO(out, rc = -EROFS); + } + + switch (info->mti_attr.ma_attr.la_mode & S_IFMT) { + case S_IFDIR: + ope = LPROC_MDT_MKDIR; + snap_ope = SNAPSHOT_COUNT_MKDIR; + break; + case S_IFREG: + /* Special file should stay on the same node as parent. */ + ope = LPROC_MDT_MKNOD; + snap_ope = SNAPSHOT_COUNT_MKNOD; + break; + case S_IFLNK: + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + default: + CERROR("%s: Unsupported mode %o\n", + mdt_obd_name(info->mti_mdt), + info->mti_attr.ma_attr.la_mode); + GOTO(out, rc = err_serious(-EOPNOTSUPP)); + } + + if (info->mti_eaname.ln_namelen == 0) { + /* snapshot create */ + mdt_counter_incr(req, ope); + mdt_snapshot_counter_incr(snap_ope); + rc = mdt_md_snapshot(info); + if (rc) + GOTO(out, rc); + } else { + /* snapshot set extend attribute */ + mdt_counter_incr(req, LPROC_MDT_SETXATTR); + mdt_snapshot_counter_incr(SNAPSHOT_COUNT_SETXATTR); + rc = mdt_md_snapshot_xattr(info); + if (rc) + GOTO(out, rc); + } + + /* clear readonly flag */ + tgt_snapshot_clear(info->mti_env); + + RETURN(0); +out: + lustre_msg_set_transno(req->rq_repmsg, 0); + + /* clear readonly flag */ + tgt_snapshot_clear(info->mti_env); + + RETURN(rc); +} + typedef int (*mdt_reinter)(struct mdt_thread_info *info, struct mdt_lock_handle *lhc); @@ -2021,6 +2542,7 @@ static mdt_reinter reinters[REINT_MAX] = { [REINT_SETXATTR] = mdt_reint_setxattr, [REINT_RMENTRY] = mdt_reint_unlink, [REINT_MIGRATE] = mdt_reint_migrate, + [REINT_SNAPSHOT] = mdt_reint_snapshot, }; int mdt_reint_rec(struct mdt_thread_info *info, diff --git a/lustre/mdt/mdt_snapshot.c b/lustre/mdt/mdt_snapshot.c new file mode 100644 index 0000000..b07474d --- /dev/null +++ b/lustre/mdt/mdt_snapshot.c @@ -0,0 +1,79 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright(c) 2016-2017 FUJITSU LIMITED. + * All rights reserved. + */ + +#include "mdt_internal.h" + +static spinlock_t ss_count_lock; +static __u64 ss_count[SNAPSHOT_COUNT_LAST]; + +/* + * mdt_snapshot_counter_init + * + * initialize the spin lock and zero every snapshot mdt counter + */ +void mdt_snapshot_counter_init(void) +{ + int i; + + /* initialize spin lock */ + spin_lock_init(&ss_count_lock); + + /* initialize counters */ + for (i = 0; i < SNAPSHOT_COUNT_LAST; i++) + ss_count[i] = 0; + + return; +} + +/* + * mdt_snapshot_counter_incr + * + * count up the snapshot mdt counter selected by op + */ +inline void mdt_snapshot_counter_incr(int op) +{ + /* increment under ss_count_lock; op is not range-checked here */ + spin_lock(&ss_count_lock); + ss_count[op]++; + spin_unlock(&ss_count_lock); + + return; +} + +/* + * mdt_snapshot_counter_get + * + * get snapshot mdt counter value + * + * \param[in] op operation code + * + * \retval 0 and more count + */ +inline __u64 mdt_snapshot_counter_get(int op) +{ + /* read without taking ss_count_lock; op is not range-checked here */ + return ss_count[op]; +} diff --git a/lustre/obdclass/lprocfs_status_server.c b/lustre/obdclass/lprocfs_status_server.c index 3ba8e70..a2975a0 100644 --- a/lustre/obdclass/lprocfs_status_server.c +++ b/lustre/obdclass/lprocfs_status_server.c @@ -668,8 +668,12 @@ void 
lprocfs_init_ops_stats(int num_private_stats, struct lprocfs_stats *stats) LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_del); LPROCFS_OBD_OP_INIT(num_private_stats, stats, getref); LPROCFS_OBD_OP_INIT(num_private_stats, stats, putref); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, snapshot_lock); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, snapshot_unlock); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, snapshot_get_info); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, snapshot_cancel_lock); - CLASSERT(NUM_OBD_STATS == OBD_COUNTER_OFFSET(putref) + 1); + CLASSERT(NUM_OBD_STATS == OBD_COUNTER_OFFSET(snapshot_cancel_lock) + 1); } EXPORT_SYMBOL(lprocfs_init_ops_stats); diff --git a/lustre/ofd/ofd_dev.c b/lustre/ofd/ofd_dev.c index 774ec02..9b036ee 100644 --- a/lustre/ofd/ofd_dev.c +++ b/lustre/ofd/ofd_dev.c @@ -52,6 +52,9 @@ #include "ofd_internal.h" +#include +#include + /* Slab for OFD object allocation */ static struct kmem_cache *ofd_object_kmem; @@ -66,6 +69,11 @@ static struct lu_kmem_descr ofd_caches[] = { } }; +#define SNAP_PRECREATE_RETRY_MAX (10) +static int snap_precreate_retry = SNAP_PRECREATE_RETRY_MAX; +CFS_MODULE_PARM(snap_precreate_retry, "i", int, 0644, + "maximum of retry precreate for snapshot"); + static int ofd_connect_to_next(const struct lu_env *env, struct ofd_device *m, const char *next, struct obd_export **exp) { @@ -742,6 +750,7 @@ int ofd_fiemap_get(const struct lu_env *env, struct ofd_device *ofd, { struct ofd_object *fo; int rc; + void *lock = NULL; fo = ofd_object_find(env, ofd, fid); if (IS_ERR(fo)) { @@ -750,12 +759,22 @@ int ofd_fiemap_get(const struct lu_env *env, struct ofd_device *ofd, return PTR_ERR(fo); } + rc = dt_snapshot_lock(ofd_object_child(fo), false, &lock); + if (rc) { + CERROR("fail to lock snapshot. 
err=%d\n", + rc); + ofd_object_put(env, fo); + return rc; + } + ofd_read_lock(env, fo); if (ofd_object_exists(fo)) rc = dt_fiemap_get(env, ofd_object_child(fo), fiemap); else rc = -ENOENT; ofd_read_unlock(env, fo); + if (lock) + dt_snapshot_unlock(ofd_object_child(fo), lock); ofd_object_put(env, fo); return rc; } @@ -1167,7 +1186,7 @@ static int ofd_orphans_destroy(const struct lu_env *env, if (unlikely(rc != 0)) GOTO(out_put, rc); - rc = ofd_destroy_by_fid(env, ofd, fid, 1); + rc = ofd_destroy_by_fid(env, ofd, fid, OST_DESTRY_ORPHAN); if (rc != 0 && rc != -ENOENT && rc != -ESTALE && likely(rc != -EREMCHG && rc != -EINPROGRESS)) /* this is pretty fatal... */ @@ -1485,13 +1504,14 @@ static int ofd_destroy_hdl(struct tgt_session_info *tsi) else count = 1; /* default case - single destroy */ - CDEBUG(D_HA, "%s: Destroy object "DOSTID" count %d\n", ofd_name(ofd), - POSTID(&body->oa.o_oi), count); + CDEBUG(D_HA, "%s: Destroy object "DOSTID" "DFID" count %d\n", + ofd_name(ofd), POSTID(&body->oa.o_oi), PFID(fid), count); while (count > 0) { int lrc; - lrc = ofd_destroy_by_fid(tsi->tsi_env, ofd, fid, 0); + lrc = ofd_destroy_by_fid(tsi->tsi_env, ofd, fid, + OST_DESTRY_NORMAL); if (lrc == -ENOENT) { CDEBUG(D_INODE, "%s: destroying non-existent object "DFID"\n", @@ -1695,6 +1715,270 @@ out: return rc; } +/** + * ofd_snapshot_clone() + * + * Snapshot clone handler in OFD + * + * \param[in] tsi target session information + * + * \retval 0 success + * \retval less than 0 failure (-errno) + */ +static int ofd_snapshot_clone(struct tgt_session_info *tsi) +{ + const struct lu_env *env = tsi->tsi_env; + struct ofd_thread_info *fti = tsi2ofd_info(tsi); + struct ost_body *req_body = tsi->tsi_ost_body; + struct ost_body *rep_body; + struct lu_fid *snap_fid, orig_fid; + struct ofd_device *ofd = ofd_exp(tsi->tsi_exp); + struct ofd_object *snap_obj, *orig_obj; + struct dt_object *snap_osd, *orig_osd; + struct ldlm_res_id resid; + struct filter_fid *ff = NULL; + struct lustre_handle lh = 
{0}; + __u64 flags = 0; + void *lock = NULL; + struct thandle *th; + int rc; + ENTRY; + + /* check request body */ + LASSERT(req_body != NULL); + + snap_fid = &req_body->oa.o_oi.oi_fid; + + /* get reply body */ + rep_body = req_capsule_server_get(tsi->tsi_pill, &RMF_OST_BODY); + if (rep_body == NULL) { + CERROR("fail to get reply body\n"); + GOTO(out, rc = -ENOMEM); + } + orig_fid.f_seq = req_body->oa.o_snapshot_orig_seq; + orig_fid.f_oid = req_body->oa.o_snapshot_orig_oid; + orig_fid.f_ver = req_body->oa.o_snapshot_orig_ver; + + /* set reply body */ + rep_body->oa.o_oi.oi_fid = *snap_fid; + rep_body->oa.o_valid = OBD_MD_FLID; + + /* lock original object by ldlm */ + ost_fid_build_resid(&orig_fid, &resid); + rc = tgt_extent_lock(tsi->tsi_tgt->lut_obd->obd_namespace, &resid, + 0, OBD_OBJECT_EOF, &lh, LCK_PW, &flags); + if (rc) { + CERROR("fail to lock snapshot original. err=%d\n", rc); + GOTO(out, rc); + } + + /* copied from ofd_preprw_write() */ + if (unlikely(tsi->tsi_exp->exp_obd->obd_recovering) || + (lustre_msg_get_flags(tgt_ses_req(tsi)->rq_reqmsg) + & (MSG_RESENT | MSG_REPLAY))) { + obd_seq seq = fid_seq(snap_fid); + obd_id oid = fid_oid(snap_fid); + struct ofd_seq *oseq; + int retry = 0; + +retry_seq_load: + oseq = ofd_seq_load(env, ofd, seq); + if (IS_ERR(oseq)) { + CERROR("%s: Can't find FID Sequence "LPX64": rc = %d\n", + ofd_name(ofd), seq, (int)PTR_ERR(oseq)); + rc = PTR_ERR(oseq); + if (rc == -ENOMEM && + retry++ < snap_precreate_retry) { + schedule_timeout_and_set_state( + TASK_INTERRUPTIBLE, + cfs_time_seconds(1)); + CDEBUG(D_INODE, "retry ofd_seq_load" + " (%d / %d)\n", + retry, snap_precreate_retry); + GOTO(retry_seq_load , rc); + } + GOTO(out_unlock_orig_ldlm, rc); + } + + if (oid > ofd_seq_last_oid(oseq)) { + int sync = 0; + int diff; + CDEBUG(D_INODE, "oid("LPX64") > last_oid("LPX64")\n", + oid, ofd_seq_last_oid(oseq)); + mutex_lock(&oseq->os_create_lock); + diff = oid - ofd_seq_last_oid(oseq); + + /* Do sync create if the seq is about to used 
up */ + if (fid_seq_is_idif(seq) || fid_seq_is_mdt0(seq)) { + if (unlikely(oid >= IDIF_MAX_OID - 1)) + sync = 1; + } else if (fid_seq_is_norm(seq)) { + if (unlikely(oid >= + LUSTRE_DATA_SEQ_MAX_WIDTH - 1)) + sync = 1; + } else { + CERROR("%s : invalid o_seq "DOSTID"\n", + ofd_name(ofd), + POSTID(&req_body->oa.o_oi)); + mutex_unlock(&oseq->os_create_lock); + ofd_seq_put(env, oseq); + GOTO(out_unlock_orig_ldlm, rc = -EINVAL); + } + + while (diff > 0) { + obd_id next_id = ofd_seq_last_oid(oseq) + 1; + int count = ofd_precreate_batch(ofd, diff); + +retry_precreate: + rc = ofd_precreate_objects(env, ofd, next_id, + oseq, count, sync); + if (rc == -ENOMEM && + retry++ < snap_precreate_retry) { + schedule_timeout_and_set_state( + TASK_INTERRUPTIBLE, + cfs_time_seconds(1)); + CDEBUG(D_INODE, "retry" + " precreate_objects" + " (%d / %d)\n", + retry, snap_precreate_retry); + GOTO(retry_precreate , rc); + } + if (rc < 0) { + mutex_unlock(&oseq->os_create_lock); + ofd_seq_put(env, oseq); + GOTO(out_unlock_orig_ldlm, rc); + } + + diff -= rc; + } + + mutex_unlock(&oseq->os_create_lock); + } + + ofd_seq_put(env, oseq); + } + + /* get snapshot object */ + snap_obj = ofd_object_find_exists(env, ofd, snap_fid); + if (IS_ERR(snap_obj)) { + rc = PTR_ERR(snap_obj); + CERROR("fail to find snapshot object. " + ""DFID" err=%d\n", PFID(snap_fid), rc); + GOTO(out_unlock_orig_ldlm, rc); + } + + rc = ofd_attr_get(env, snap_obj, &fti->fti_attr); + if (rc) { + GOTO(out_free_snap_obj, rc); + } else if ((fti->fti_attr.la_flags & SNAPSHOT_FLAGS_MASK) == + OST_SNAPSHOT_FILE_PATT) { + /* processing has been completed already, + * so there is nothing to do. */ + GOTO(out_free_snap_obj, rc = 0); + } + + /* get snapshot osd object */ + snap_osd = ofd_object_child(snap_obj); + + /* get original ofd object */ + orig_obj = ofd_object_find_exists(env, ofd, &orig_fid); + if (IS_ERR(orig_obj)) { + rc = PTR_ERR(orig_obj); + CERROR("cannot find snapshot original. 
" + ""DFID"err=%d\n", PFID(&orig_fid), rc); + GOTO(out_free_snap_obj, rc); + } + + /* get original osd object */ + orig_osd = ofd_object_child(orig_obj); + + /* get snapshot attributes from request body */ + la_from_obdo(&fti->fti_attr, &req_body->oa, + req_body->oa.o_valid); + + if (req_body->oa.o_valid & OBD_MD_FLFID) { + ff = &fti->fti_mds_fid; + ofd_prepare_fidea(ff, &req_body->oa); + } + /* set snapshot attributes */ + /* not necessary to protect it with snapshot_lock */ + rc = dt_snapshot_lock(orig_osd, true, &lock); + if (rc) { + CERROR("fail to lock snapshot. err=%d\n", rc); + GOTO(out_free_orig_obj, rc); + } + ofd_read_lock(env, orig_obj); + rc = ofd_write_attr_set(env, ofd, snap_obj, + &fti->fti_attr, ff, + 1 /* is_snapshot */); + if (rc) { + CERROR("fail to set attributes. err=%d\n", rc); + GOTO(out_read_unlock, rc); /* release both locks taken above */ + } + + /* set snapshot attribute to reply body */ + obdo_from_la(&rep_body->oa, &fti->fti_attr, + LA_ATIME | LA_MTIME | LA_CTIME | + LA_MODE | LA_UID | LA_GID); + + + th = ofd_trans_create(env, ofd); + if (IS_ERR(th)) { + rc = PTR_ERR(th); + CERROR("fail to trans_create. err=%d\n", rc); + GOTO(out_read_unlock, rc); + } + + /* call osd in ost */ + rc = dt_osd_declare_snapshot_clone(env, snap_osd, orig_osd, + th, req_body->oa.o_flags); + if (rc) { + CERROR("fail to declare_snapshot_clone. err=%d\n", rc); + GOTO(out_stop, rc); + } + + rc = ofd_trans_start(env, ofd, snap_obj, th); + if (rc) { + CERROR("fail to trans_start. 
err=%d\n", rc); + GOTO(out_stop, rc); + } + rc = dt_osd_snapshot_clone(snap_osd, orig_osd); + +out_stop: + ofd_trans_stop(env, ofd, th, rc); +out_read_unlock: + /* unlock original object */ + ofd_read_unlock(env, orig_obj); + dt_snapshot_unlock(orig_osd, lock); +out_free_orig_obj: + /* free original object */ + ofd_object_put(env, orig_obj); +out_free_snap_obj: + /* free snapshot object */ + ofd_object_put(env, snap_obj); +out_unlock_orig_ldlm: + /* unlock orignal object by ldlm */ + tgt_extent_unlock(&lh, LCK_PW); +out: + if (rc == 0) + /* increment statistics */ + ofd_counter_incr(tsi->tsi_exp, + LPROC_OFD_STATS_CREATE, + tsi->tsi_jobid, + 1); + else { + if (ff) + CERROR("snapshot clone failed." + " mdt_fid="DFID" ost_fid="DFID"" + " orig_ost_fid="DFID"\n", + PFID(&ff->ff_parent), PFID(snap_fid), + PFID(&orig_fid)); + else + CERROR("snapshot clone failed.\n"); + } + RETURN(rc); +} + static int ofd_quotactl(struct tgt_session_info *tsi) { struct obd_quotactl *oqctl, *repoqc; @@ -2047,6 +2331,8 @@ TGT_OST_HDL(0 | HABEO_REFERO, OST_STATFS, ofd_statfs_hdl), TGT_OST_HDL_HP(HABEO_CORPUS| HABEO_REFERO, OST_BRW_READ, tgt_brw_read, ofd_hp_brw), +TGT_OST_HDL(HABEO_CORPUS | HABEO_REFERO | MUTABOR, + OST_SNAPSHOT, ofd_snapshot_clone), /* don't set CORPUS flag for brw_write because -ENOENT may be valid case */ TGT_OST_HDL_HP(HABEO_CORPUS| MUTABOR, OST_BRW_WRITE, tgt_brw_write, ofd_hp_brw), diff --git a/lustre/ofd/ofd_dlm.c b/lustre/ofd/ofd_dlm.c index 6337548..1a83ccb 100644 --- a/lustre/ofd/ofd_dlm.c +++ b/lustre/ofd/ofd_dlm.c @@ -41,6 +41,7 @@ #define DEBUG_SUBSYSTEM S_FILTER +#include #include "ofd_internal.h" struct ofd_intent_args { @@ -107,6 +108,11 @@ int ofd_intent_policy(struct ldlm_namespace *ns, struct ldlm_lock **lockp, [DLM_REPLY_REC_OFF] = sizeof(*reply_lvb) }; struct ldlm_glimpse_work gl_work; + struct ofd_device *ofd; + struct ofd_object *fo = NULL; + struct ofd_thread_info *info; + struct lu_env env; + bool snapshot = false; CFS_LIST_HEAD(gl_list); ENTRY; @@ 
-139,6 +145,39 @@ int ofd_intent_policy(struct ldlm_namespace *ns, struct ldlm_lock **lockp, RETURN(ELDLM_LOCK_ABORTED); } + /* copied from ofd_lvbo_update() */ + ofd = ldlm_res_to_ns(res)->ns_lvbp; + LASSERT(ofd != NULL); + + rc = lu_env_init(&env, LCT_DT_THREAD); + if (rc) { + /* if error occurred, + * set "true" to snapshot to return ELDLM_LOCK_ABORTED + */ + snapshot = true; + goto skip; + } + + info = ofd_info_init(&env, NULL); + + ost_fid_from_resid(&info->fti_fid, &res->lr_name, + ofd->ofd_lut.lut_lsd.lsd_osd_index); + fo = ofd_object_find(&env, ofd, &info->fti_fid); + if (IS_ERR(fo)) { + lu_env_fini(&env); + snapshot = true; + goto skip; + } + rc = ofd_attr_get(&env, fo, &info->fti_attr); + if (rc) + snapshot = true; + else if ((info->fti_attr.la_flags & SNAPSHOT_FLAGS_MASK) == + OST_SNAPSHOT_FILE_PATT) + snapshot = true; + + ofd_object_put(&env, fo); + lu_env_fini(&env); +skip: LASSERT(ns == ldlm_res_to_ns(res)); lock_res(res); @@ -161,7 +200,8 @@ int ofd_intent_policy(struct ldlm_namespace *ns, struct ldlm_lock **lockp, * list (and potentially being added to l_pending_list by an * AST) when we are going to drop this lock ASAP. 
*/ if (lock->l_export->exp_libclient || - OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_GLIMPSE, 2)) { + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_GLIMPSE, 2) || + snapshot) { ldlm_resource_unlink_lock(lock); err = ELDLM_LOCK_ABORTED; } else { diff --git a/lustre/ofd/ofd_internal.h b/lustre/ofd/ofd_internal.h index 934b109..bdf69f0 100644 --- a/lustre/ofd/ofd_internal.h +++ b/lustre/ofd/ofd_internal.h @@ -87,6 +87,13 @@ enum { LPROC_OFD_STATS_LAST, }; +enum { + OST_DESTRY_NORMAL = 0, + OST_DESTRY_ORPHAN, + OST_DESTRY_SNAPSHOT_ORPHAN, + OST_DESTRY_LAST, +}; + static inline void ofd_counter_incr(struct obd_export *exp, int opcode, char *jobid, long amount) { @@ -377,6 +384,9 @@ void ofd_seqs_free(const struct lu_env *env, struct ofd_device *ofd); /* ofd_io.c */ int ofd_start_inconsistency_verification_thread(struct ofd_device *ofd); int ofd_stop_inconsistency_verification_thread(struct ofd_device *ofd); +int ofd_write_attr_set(const struct lu_env *env, struct ofd_device *ofd, + struct ofd_object *ofd_obj, struct lu_attr *la, + struct filter_fid *ff, int is_snapshot); int ofd_verify_ff(const struct lu_env *env, struct ofd_object *fo, struct obdo *oa); int ofd_preprw(const struct lu_env *env,int cmd, struct obd_export *exp, diff --git a/lustre/ofd/ofd_io.c b/lustre/ofd/ofd_io.c index 3d7e60a..ef61a84 100644 --- a/lustre/ofd/ofd_io.c +++ b/lustre/ofd/ofd_io.c @@ -542,6 +542,7 @@ static int ofd_preprw_write(const struct lu_env *env, struct obd_export *exp, err: dt_bufs_put(env, ofd_object_child(fo), lnb, *nr_local); ofd_read_unlock(env, fo); + ofd_object_put(env, fo); /* ofd_grant_prepare_write() was called, so we must commit */ ofd_grant_commit(env, exp, rc); out: @@ -662,10 +663,10 @@ ofd_commitrw_read(const struct lu_env *env, struct ofd_device *ofd, RETURN(0); } -static int +int ofd_write_attr_set(const struct lu_env *env, struct ofd_device *ofd, struct ofd_object *ofd_obj, struct lu_attr *la, - struct filter_fid *ff) + struct filter_fid *ff, int is_snapshot) { struct ofd_thread_info *info = 
ofd_info(env); __u64 valid = la->la_valid; @@ -848,7 +849,7 @@ ofd_commitrw_write(const struct lu_env *env, struct obd_export *exp, * dt_declare_write_commit() since quota enforcement is now handled in * declare phases. */ - rc = ofd_write_attr_set(env, ofd, fo, la, ff); + rc = ofd_write_attr_set(env, ofd, fo, la, ff, 0); if (rc) GOTO(out, rc); diff --git a/lustre/ofd/ofd_obd.c b/lustre/ofd/ofd_obd.c index 7b3fa77..a9f192a 100644 --- a/lustre/ofd/ofd_obd.c +++ b/lustre/ofd/ofd_obd.c @@ -48,6 +48,9 @@ #include #include #include +#include + +#define SNAPSHOT_ORPHAN_ARRAY_MAX 3 static int ofd_export_stats_init(struct ofd_device *ofd, struct obd_export *exp, void *client_nid) @@ -754,6 +757,121 @@ out: return rc; } +static int ofd_snapshot_destroy(const struct lu_env *env, + struct ofd_device *ofd, + struct ofd_object *fo, bool *is_snap) +{ + void *lock = NULL; + int rc = 0, i; + int array_num = SNAPSHOT_ORPHAN_ARRAY_MAX; + struct lu_fid *fid_array = NULL; + struct lu_fid orig_fid; + struct ofd_thread_info *info = ofd_info(env); + ENTRY; + + /* if target is not snapshot_orig or snapshot, + * dt_snapshot_lock() doesn't hold lock + * and lock is NULL. + */ + rc = dt_snapshot_lock(ofd_object_child(fo), false, &lock); + if (rc) { + CERROR("fail to lock snapshot. 
err=%d\n", rc); + RETURN(rc); + } + if (!lock) { + *is_snap = false; + RETURN(0); + } + *is_snap = true; + + rc = ofd_attr_get(env, fo, &info->fti_attr); + if (rc) { + CERROR("fail to get inode flags\n"); + GOTO(unlock, rc); + } + info->fti_attr.la_valid = LA_FLAGS; + info->fti_attr.la_flags |= LUSTRE_SNAPSHOT_FL; + + ofd_read_lock(env, fo); + rc = ofd_write_attr_set(env, ofd, fo, &info->fti_attr, NULL, + 1 /* is_snapshot */); + ofd_read_unlock(env, fo); + if (rc) { + CERROR("fail to set del flag\n"); + GOTO(unlock, rc); + } + + OBD_ALLOC(fid_array, sizeof(struct lu_fid) * SNAPSHOT_ORPHAN_ARRAY_MAX); + if (fid_array == NULL) { + CERROR("fail to alloc fid_array\n"); + GOTO(unlock, rc = -ENOMEM); + } + + ofd_write_lock(env, fo); + if (!ofd_object_exists(fo)) { + OBD_FREE(fid_array, + sizeof(struct lu_fid) * SNAPSHOT_ORPHAN_ARRAY_MAX); + ofd_write_unlock(env, fo); + GOTO(unlock, rc = -ENOENT); + } +retry: + /* get list of orphan inode to delete */ + rc = dt_snapshot_get_orphan(ofd_object_child(fo), + fid_array, &array_num); + if ((rc == 0 || rc == -EAGAIN) && (array_num != 0)) { + for (i = 0; i < array_num; i++) { + ofd_destroy_by_fid(env, + ofd, &fid_array[i], + OST_DESTRY_SNAPSHOT_ORPHAN); + } + } + if (rc == -EAGAIN) { + memset(fid_array, 0, sizeof(struct lu_fid) + * SNAPSHOT_ORPHAN_ARRAY_MAX); + array_num = SNAPSHOT_ORPHAN_ARRAY_MAX; + goto retry; + } + OBD_FREE(fid_array, sizeof(struct lu_fid) * SNAPSHOT_ORPHAN_ARRAY_MAX); + + rc = dt_snapshot_destroy(ofd_object_child(fo), &orig_fid); + if (rc < 0) + CERROR("fail to snapshot destroy. 
err=%d\n", rc); + + ofd_write_unlock(env, fo); +unlock: + dt_snapshot_unlock(ofd_object_child(fo), lock); + + if (rc == 1) { + /* In the case of the last snapshot deletion, + * get LCK_PW to notify client a change of i_blocks by + * the deletion of snapshot_link */ + struct lustre_handle lockh; + ldlm_policy_data_t policy = { + .l_extent = { 0, OBD_OBJECT_EOF, 0 } + }; + __u64 flags = 0; + struct ldlm_res_id res_id; + + ost_fid_build_resid((const struct lu_fid *)&orig_fid, &res_id); + rc = ldlm_cli_enqueue_local(ofd->ofd_namespace, &res_id, + LDLM_EXTENT, + &policy, LCK_PW, &flags, + ldlm_blocking_ast, + ldlm_completion_ast, + NULL, NULL, 0, LVB_T_NONE, + NULL, &lockh); + if (rc == ELDLM_OK) + ldlm_lock_decref(&lockh, LCK_PW); + else + /* deletion processing continues + * even if failed to get lock */ + rc = 0; + } + + RETURN(rc); +} + + int ofd_destroy_by_fid(const struct lu_env *env, struct ofd_device *ofd, const struct lu_fid *fid, int orphan) { @@ -762,9 +880,11 @@ int ofd_destroy_by_fid(const struct lu_env *env, struct ofd_device *ofd, __u64 flags = LDLM_FL_AST_DISCARD_DATA; __u64 rc = 0; ldlm_policy_data_t policy = { - .l_extent = { 0, OBD_OBJECT_EOF } + .l_extent = { 0, OBD_OBJECT_EOF, 0 } }; struct ofd_object *fo; + int err = 0; + bool is_snap = false; ENTRY; @@ -785,12 +905,17 @@ int ofd_destroy_by_fid(const struct lu_env *env, struct ofd_device *ofd, ldlm_lock_decref(&lockh, LCK_PW); LASSERT(fo != NULL); - - rc = ofd_object_destroy(env, fo, orphan); + if (orphan != OST_DESTRY_SNAPSHOT_ORPHAN) { + err = ofd_snapshot_destroy(env, ofd, fo, &is_snap); + if (err) + GOTO(out, err); + } + err = ofd_object_destroy(env, fo, orphan); EXIT; - + out: ofd_object_put(env, fo); - RETURN(rc); + + RETURN(err); } /* needed by echo client only for now, RPC handler uses ofd_destroy_hdl() */ @@ -809,7 +934,7 @@ int ofd_echo_destroy(const struct lu_env *env, struct obd_export *exp, CDEBUG(D_HA, "%s: Destroy object "DFID"\n", ofd_name(ofd), PFID(fid)); - rc = 
ofd_destroy_by_fid(env, ofd, fid, 0); + rc = ofd_destroy_by_fid(env, ofd, fid, OST_DESTRY_NORMAL); if (rc == -ENOENT) { CDEBUG(D_INODE, "%s: destroying non-existent object "DFID"\n", ofd_name(ofd), PFID(fid)); @@ -996,6 +1121,94 @@ out: return rc; } +/** + * ofd_ioc_snapshot_orphan() + * + * Handle the snapshot orphan ioctl; ioc_command selects the sub-operation. + * + * \param[in] env execution environment + * \param[in] ofd OFD device + * \param[in] karg struct obd_ioctl_data of the request + * + * \retval 0 success + * \retval not 0 error code + */ +static int ofd_ioc_snapshot_orphan(const struct lu_env *env, + struct ofd_device *ofd, void *karg) +{ + struct obd_ioctl_data *data = karg; + int rc = 0; + struct lu_buf *bufp, buf; + + ENTRY; + + if (data->ioc_plen1 && data->ioc_pbuf1) { + OBD_ALLOC(buf.lb_buf, data->ioc_plen1); + if (buf.lb_buf == NULL) + RETURN(-ENOMEM); + rc = copy_from_user(buf.lb_buf, data->ioc_pbuf1, + data->ioc_plen1); + if (rc) { + OBD_FREE(buf.lb_buf, data->ioc_plen1); + RETURN(-EFAULT); + } + buf.lb_len = data->ioc_plen1; + bufp = &buf; + } else + bufp = NULL; + + if (data->ioc_command == OBD_IOC_SNAPSHOT_ORPHAN_INODE || + data->ioc_command == OBD_IOC_SNAPSHOT_ORPHAN_DEL) { + struct ofd_object *fo = NULL; + struct lu_fid fid; + + /* inlbuf1 is caller-supplied: require a full FID */ + if (data->ioc_inlbuf1 == NULL || + data->ioc_inllen1 < sizeof(fid)) + GOTO(out, rc = -EINVAL); + fid = *(struct lu_fid *)data->ioc_inlbuf1; + + if (!fid_is_sane(&fid)) + GOTO(out, rc = -EBFONT); + + fo = ofd_object_find(env, ofd, &fid); + if (IS_ERR(fo)) + GOTO(out, rc = PTR_ERR(fo)); + + if (!ofd_object_exists(fo)) { + ofd_object_put(env, fo); + GOTO(out, rc = -ENOENT); + } + if (data->ioc_command == OBD_IOC_SNAPSHOT_ORPHAN_INODE) { + rc = dt_osd_snapshot_get_old_list(ofd_object_child(fo), + (void *)bufp); + } else { + /* check if there is an old snapshot */ + rc = dt_osd_snapshot_get_old_list(ofd_object_child(fo), + NULL); + if (rc == 0) + rc = ofd_destroy_by_fid(env, ofd, &fid, + OST_DESTRY_ORPHAN); + } + ofd_object_put(env, fo); + } else { + if (!bufp) + GOTO(out, rc = -EFAULT); + rc = dt_snapshot_list_orphan(ofd->ofd_osd, (void *)bufp); + } + if (bufp) { /* -EAGAIN has valid return data */ + int rc2; + rc2 = obd_ioctl_popdata(data->ioc_pbuf1, buf.lb_buf, + data->ioc_plen1); + if (rc2 != 0) + rc = rc2; + } +out: + if (bufp) + OBD_FREE(buf.lb_buf, data->ioc_plen1); + RETURN(rc); +} + int ofd_iocontrol(unsigned int 
cmd, struct obd_export *exp, int len, void *karg, void *uarg) { @@ -1057,6 +1253,9 @@ int ofd_iocontrol(unsigned int cmd, struct obd_export *exp, int len, case OBD_IOC_GET_OBJ_VERSION: rc = ofd_ioc_get_obj_version(&env, ofd, karg); break; + case OBD_IOC_SNAPSHOT_ORPHAN: + rc = ofd_ioc_snapshot_orphan(&env, ofd, karg); + break; default: CERROR("%s: not supported cmd = %d\n", obd->obd_name, cmd); rc = -ENOTTY; @@ -1174,6 +1373,44 @@ static int ofd_quotactl(struct obd_device *obd, struct obd_export *exp, RETURN(rc); } +static int ofd_snapshot_lock(const struct lu_env *env, + struct obd_export *exp, + struct obdo *oa, + bool create, void **lock) +{ + struct ofd_device *ofd = ofd_exp(exp); + struct lu_fid *fid = &oa->o_oi.oi_fid; + struct ofd_object *fo; + int rc = 0; + ENTRY; + + fo = ofd_object_find_exists(env, ofd, fid); + if (IS_ERR(fo)) + RETURN(PTR_ERR(fo)); + + rc = dt_snapshot_lock(ofd_object_child(fo), create, lock); + ofd_object_put(env, fo); + RETURN(rc); +} + +static int ofd_snapshot_unlock(const struct lu_env *env, + struct obd_export *exp, + struct obdo *oa, void *lock) +{ + struct ofd_device *ofd = ofd_exp(exp); + struct lu_fid *fid = &oa->o_oi.oi_fid; + struct ofd_object *fo; + ENTRY; + + fo = ofd_object_find_exists(env, ofd, fid); + if (IS_ERR(fo)) + RETURN(PTR_ERR(fo)); + + dt_snapshot_unlock(ofd_object_child(fo), lock); + ofd_object_put(env, fo); + RETURN(0); +} + struct obd_ops ofd_obd_ops = { .o_owner = THIS_MODULE, .o_connect = ofd_obd_connect, @@ -1196,4 +1433,6 @@ struct obd_ops ofd_obd_ops = { .o_quotactl = ofd_quotactl, .o_set_info_async = ofd_set_info_async, .o_get_info = ofd_get_info, + .o_snapshot_lock = ofd_snapshot_lock, + .o_snapshot_unlock = ofd_snapshot_unlock, }; diff --git a/lustre/ofd/ofd_objects.c b/lustre/ofd/ofd_objects.c index c60e18f..c4b61c2 100644 --- a/lustre/ofd/ofd_objects.c +++ b/lustre/ofd/ofd_objects.c @@ -44,7 +44,7 @@ #include #include #include - +#include #include "ofd_internal.h" int ofd_version_get_check(struct 
ofd_thread_info *info, @@ -491,6 +491,7 @@ int ofd_object_punch(const struct lu_env *env, struct ofd_object *fo, struct thandle *th; int ff_needed = 0; int rc; + void *lock = NULL; ENTRY; @@ -502,6 +503,15 @@ int ofd_object_punch(const struct lu_env *env, struct ofd_object *fo, fmd->fmd_mactime_xid = info->fti_xid; ofd_fmd_put(info->fti_exp, fmd); + /* if target is not snapshot_orig, + * dt_snapshot_lock() doesn't hold snapshot_lock + * and lock is NULL */ + rc = dt_snapshot_lock(dob, false, &lock); + if (rc) { + CERROR("fail to lock snapshot. err=%d\n", + rc); + return rc; + } ofd_write_lock(env, fo); if (!ofd_object_exists(fo)) GOTO(unlock, rc = -ENOENT); @@ -585,7 +595,8 @@ stop: ofd_trans_stop(env, ofd, th, rc); unlock: ofd_write_unlock(env, fo); - + if (lock) + dt_snapshot_unlock(dob, lock); return rc; } diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index 38b0d99..7b9fd78 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -3267,6 +3267,23 @@ static int osc_process_config(struct obd_device *obd, obd_count len, void *buf) return osc_process_config_base(obd, buf); } +static int osc_snapshot_cancel_lock(struct obd_export *exp, void *val) +{ + struct obdo oa; + struct list_head cancels = LIST_HEAD_INIT(cancels); + struct ost_id *id = val; + int count; + ENTRY; + + memcpy(&oa.o_oi, id, sizeof(struct ost_id)); + /* Specify LCK_PW to cancel LCK_PR */ + count = osc_resource_get_unused(exp, &oa, &cancels, LCK_PW, + LDLM_FL_DISCARD_DATA); + if (count == 0) + RETURN(0); + RETURN(ldlm_cli_cancel_list(&cancels, count, NULL, 0)); +} + struct obd_ops osc_obd_ops = { .o_owner = THIS_MODULE, .o_setup = osc_setup, @@ -3295,6 +3312,7 @@ struct obd_ops osc_obd_ops = { .o_process_config = osc_process_config, .o_quotactl = osc_quotactl, .o_quotacheck = osc_quotacheck, + .o_snapshot_cancel_lock = osc_snapshot_cancel_lock, }; extern struct lu_kmem_descr osc_caches[]; diff --git a/lustre/osd-ldiskfs/osd_handler.c b/lustre/osd-ldiskfs/osd_handler.c
index 0c5a07d..9a86be9 100644 --- a/lustre/osd-ldiskfs/osd_handler.c +++ b/lustre/osd-ldiskfs/osd_handler.c @@ -72,7 +72,8 @@ #include #include -#include +/* snapshot lock */ +#include int ldiskfs_pdo = 1; CFS_MODULE_PARM(ldiskfs_pdo, "i", int, 0644, @@ -1430,6 +1431,68 @@ static int osd_init_capa_ctxt(const struct lu_env *env, struct dt_device *d, RETURN(0); } +/* + * osd_snapshot_get_enable + * + * snapshot get enable status + * + * \param[in] env lustre environment + * \param[in] dt osd dt device + * + * \retval 0 snapshot is disabled + * \retval 1 snapshot is enabled + */ +static int osd_snapshot_get_enable(const struct lu_env *env, + struct dt_device *dt) +{ + struct osd_device *osd = osd_dt_dev(dt); + struct super_block *sb = osd_sb(osd); + int rc; + ENTRY; + + /* check if snapshot is enabled */ + rc = ldiskfs_snapshot_get_enable(sb); + + RETURN(rc); +} + +/* + * osd_snapshot_set_enable + * + * snapshot set enable status + * + * \param[in] env lustre environment + * \param[in] dt osd dt device + * + * \retval 0 success + * \retval less than 0 failure (-errno) + */ +static int osd_snapshot_set_enable(const struct lu_env *env, + struct dt_device *dt) +{ + struct osd_device *osd = osd_dt_dev(dt); + struct super_block *sb = osd_sb(osd); + int rc; + ENTRY; + + /* enable snapshot */ + rc = ldiskfs_snapshot_set_enable(sb); + + RETURN(rc); +} + +static int osd_snapshot_list_orphan(struct dt_device *dt, + void *buf) +{ + struct osd_device *osd = osd_dt_dev(dt); + struct super_block *sb = osd_sb(osd); + int rc; + ENTRY; + + rc = ldiskfs_snapshot_list_orphan(sb, buf); + RETURN(rc); +} + /** * Note: we do not count into QUOTA here. * If we mount with --data_journal we may need more.
@@ -1500,6 +1563,9 @@ static const struct dt_device_operations osd_dt_ops = { .dt_ro = osd_ro, .dt_commit_async = osd_commit_async, .dt_init_capa_ctxt = osd_init_capa_ctxt, + .dt_snapshot_get_enable = osd_snapshot_get_enable, + .dt_snapshot_set_enable = osd_snapshot_set_enable, + .dt_snapshot_list_orphan = osd_snapshot_list_orphan, }; static void osd_object_read_lock(const struct lu_env *env, @@ -3389,6 +3455,202 @@ static int osd_otable_it_attr_get(const struct lu_env *env, return 0; } +static int osd_declare_snapshot_clone(const struct lu_env *env, + struct dt_object *snap_dt, + struct dt_object *orig_dt, + struct thandle *handle, + int ignore_flag) +{ + struct osd_object *snap_obj = osd_dt_obj(snap_dt); + struct osd_object *orig_obj = osd_dt_obj(orig_dt); + struct inode *snap_inode = snap_obj->oo_inode; + struct inode *orig_inode = orig_obj->oo_inode; + struct osd_thandle *oh; + int rc; + int ignore_quota = + (ignore_flag & OBD_BRW_NOQUOTA) ? 1 : 0; + int quota_space_blocks = + (sizeof(struct ldiskfs_snapshot_link) + + LDISKFS_BLOCK_SIZE(snap_inode->i_sb) - 1) + >> LDISKFS_BLOCK_SIZE_BITS(snap_inode->i_sb); + long long quota_space = + toqb(quota_space_blocks << + LDISKFS_BLOCK_SIZE_BITS(snap_inode->i_sb)); + + ENTRY; + + LASSERT(handle != NULL); + oh = container_of0(handle, struct osd_thandle, ot_super); + LASSERT(oh->ot_handle == NULL); + + if (!dt_object_exists(snap_dt)) + RETURN(-ENOENT); + if (!dt_object_exists(orig_dt)) + RETURN(-ENOENT); + + quota_space_blocks = (sizeof(struct ldiskfs_snapshot_link) + + LDISKFS_BLOCK_SIZE(snap_inode->i_sb) - 1) + >> LDISKFS_BLOCK_SIZE_BITS(snap_inode->i_sb); + quota_space = toqb(quota_space_blocks + << LDISKFS_BLOCK_SIZE_BITS(snap_inode->i_sb)); + + /* for ldiskfs_snapshot_set_enable() */ + if (!ldiskfs_snapshot_get_enable(snap_inode->i_sb)) + oh->ot_credits++; + + oh->ot_credits += ldiskfs_calc_snapshot_link_credits(snap_inode, + SNAPSHOT_CREATE_OP); + + if (!LDISKFS_TEST_OST_SNAPSHOT(orig_inode)) { + rc = 
osd_declare_inode_qid(env, i_uid_read(orig_inode), + i_gid_read(orig_inode), + quota_space, oh, + orig_obj, true, NULL, + ignore_quota); + if (rc) { + CERROR("fail to orig osd_declare_inode_qid. rc=%d\n", + rc); + RETURN(rc); + } + } + rc = osd_declare_inode_qid(env, i_uid_read(snap_inode), + i_gid_read(snap_inode), + quota_space, oh, + snap_obj, true, NULL, + ignore_quota); + if (rc) + CERROR("fail to snapshot osd_declare_inode_qid. rc=%d\n", + rc); + RETURN(rc); +} + +/* + * osd_snapshot_clone() + * + * create snapshot at osd layer + * + * \param[in] snap_dt snapshot osd object + * \param[in] orig_dt original osd object + * + * \retval 0 success + * \retval less than 0 failure (error code -errno) + */ +static int osd_snapshot_clone(struct dt_object *snap_dt, + struct dt_object *orig_dt) +{ + struct osd_object *snap_obj = osd_dt_obj(snap_dt); + struct osd_object *orig_obj = osd_dt_obj(orig_dt); + struct osd_device *osd = osd_dev(snap_dt->do_lu.lo_dev); + struct super_block *sb = osd_sb(osd); + struct inode *snap_inode = snap_obj->oo_inode; + struct inode *orig_inode = orig_obj->oo_inode; + int rc; + ENTRY; + + LASSERT(snap_inode); + LASSERT(orig_inode); + + /* check if snapshot is enabled */ + rc = ldiskfs_snapshot_get_enable(sb); + if (rc != 1) { + rc = ldiskfs_snapshot_set_enable(sb); + if (rc) + RETURN(rc); + } + + /* clone inode extents */ + rc = ldiskfs_snapshot_clone(snap_inode, orig_inode); + + RETURN(rc); +} + +static int osd_snapshot_get_old_list(struct dt_object *snap_dt, + void *list_buf) +{ + ENTRY; + + if (!dt_object_exists(snap_dt)) + RETURN(-ENOENT); + + RETURN(ldiskfs_snapshot_get_old_list(osd_dt_obj(snap_dt)->oo_inode, + list_buf)); +} + +/* + * osd_snapshot_lock() + * + * snapshot lock for all generations + * + * \param[in] dt osd object + * \param[in] create for create snapshot + * \param[out] lock snapshot lock object + * + * \retval 0 success + * \retval less than 0 failure (error code -errno) + */ +static int osd_snapshot_lock(struct 
dt_object *dt, bool create, + void **lock) +{ + struct osd_object *osd_obj = osd_dt_obj(dt); + struct inode *inode = osd_obj->oo_inode; + int rc; + ENTRY; + + if (!dt_object_exists(dt)) + RETURN(-ENOENT); + + /* lock snapshot at ldiskfs layer */ + rc = ldiskfs_snapshot_lock(inode, create, + (struct ldiskfs_snapshot_gen_lock **)lock); + + RETURN(rc); +} + +/* + * osd_snapshot_unlock() + * + * snapshot unlock for all generations + * + * \param[in] lock snapshot lock object + * + * \retval none + */ +static void osd_snapshot_unlock(void *lock) +{ + ENTRY; + + /* unlock snapshot at ldiskfs layer */ + ldiskfs_snapshot_unlock((struct ldiskfs_snapshot_gen_lock *)lock); + + EXIT; +} + +static int osd_snapshot_get_orphan(struct dt_object *dt, + void *fid_buf, + int *array_num) +{ + struct osd_object *osd_obj = osd_dt_obj(dt); + struct inode *inode = osd_obj->oo_inode; + ENTRY; + + LASSERT(inode); + + RETURN(ldiskfs_snapshot_get_orphan(inode, + fid_buf, + array_num)); +} + +static int osd_snapshot_destroy(struct dt_object *dt, void *orig_fid) +{ + struct osd_object *osd_obj = osd_dt_obj(dt); + struct inode *inode = osd_obj->oo_inode; + ENTRY; + + LASSERT(inode); + + RETURN(ldiskfs_snapshot_destroy(inode, orig_fid)); +} + static const struct dt_object_operations osd_obj_ops = { .do_read_lock = osd_object_read_lock, .do_write_lock = osd_object_write_lock, @@ -3417,6 +3679,13 @@ static const struct dt_object_operations osd_obj_ops = { .do_capa_get = osd_capa_get, .do_object_sync = osd_object_sync, .do_data_get = osd_data_get, + .do_osd_snapshot_clone = osd_snapshot_clone, + .do_osd_declare_snapshot_clone = osd_declare_snapshot_clone, + .do_osd_snapshot_get_old_list = osd_snapshot_get_old_list, + .do_osd_snapshot_lock = osd_snapshot_lock, + .do_osd_snapshot_unlock = osd_snapshot_unlock, + .do_osd_snapshot_destroy = osd_snapshot_destroy, + .do_osd_snapshot_get_orphan = osd_snapshot_get_orphan, }; /** @@ -3451,6 +3720,13 @@ static const struct dt_object_operations osd_obj_ea_ops = 
{ .do_capa_get = osd_capa_get, .do_object_sync = osd_object_sync, .do_data_get = osd_data_get, + .do_osd_snapshot_clone = osd_snapshot_clone, + .do_osd_declare_snapshot_clone = osd_declare_snapshot_clone, + .do_osd_snapshot_get_old_list = osd_snapshot_get_old_list, + .do_osd_snapshot_lock = osd_snapshot_lock, + .do_osd_snapshot_unlock = osd_snapshot_unlock, + .do_osd_snapshot_destroy = osd_snapshot_destroy, + .do_osd_snapshot_get_orphan = osd_snapshot_get_orphan, }; static const struct dt_object_operations osd_obj_otable_it_ops = { diff --git a/lustre/osd-ldiskfs/osd_io.c b/lustre/osd-ldiskfs/osd_io.c index fdb7915..d4f0f5f 100644 --- a/lustre/osd-ldiskfs/osd_io.c +++ b/lustre/osd-ldiskfs/osd_io.c @@ -60,6 +60,9 @@ /* ext_depth() */ #include +/* for snapshot lock */ +#include + static int __osd_init_iobuf(struct osd_device *d, struct osd_iobuf *iobuf, int rw, int line, int pages) { @@ -502,14 +505,6 @@ static int osd_bufs_put(const struct lu_env *env, struct dt_object *dt, #define ldiskfs_ext_pblock(ex) ext_pblock((ex)) #endif -struct bpointers { - unsigned long *blocks; - unsigned long start; - int num; - int init_num; - int create; -}; - static long ldiskfs_ext_find_goal(struct inode *inode, struct ldiskfs_ext_path *path, unsigned long block, int *aflags) @@ -597,7 +592,7 @@ static int ldiskfs_ext_new_extent_cb(struct inode *inode, goto map; } - if (bp->create == 0) { + if (bp->create != WRITE_COMMIT_OP) { i = 0; if (cex->ec_block < bp->start) i = bp->start - cex->ec_block; @@ -832,6 +827,8 @@ static int osd_ldiskfs_map_inode_pages(struct inode *inode, struct page **page, return rc; } +static int osd_read_prep(const struct lu_env *env, struct dt_object *dt, + struct niobuf_local *lnb, int npages); static int osd_write_prep(const struct lu_env *env, struct dt_object *dt, struct niobuf_local *lnb, int npages) @@ -851,6 +848,21 @@ static int osd_write_prep(const struct lu_env *env, struct dt_object *dt, LASSERT(inode); + if (npages && 
LDISKFS_TEST_OST_SNAPSHOT_ORIG(inode)) { + rc = osd_init_iobuf(osd, iobuf, 0, npages); + if (unlikely(rc != 0)) + RETURN(rc); + for (i = 0; i < npages; i++) + osd_iobuf_add_page(iobuf, lnb[i].page); + + rc = osd_ldiskfs_map_inode_pages(inode, iobuf->dr_pages, + npages, + iobuf->dr_blocks, + WRITE_COPY_OP); + if (rc) + RETURN(rc); + } + rc = osd_init_iobuf(osd, iobuf, 0, npages); if (unlikely(rc != 0)) RETURN(rc); @@ -903,7 +915,8 @@ static int osd_write_prep(const struct lu_env *env, struct dt_object *dt, if (iobuf->dr_npages) { rc = osd_ldiskfs_map_inode_pages(inode, iobuf->dr_pages, iobuf->dr_npages, - iobuf->dr_blocks, 0); + iobuf->dr_blocks, + WRITE_PREP_OP); if (likely(rc == 0)) { rc = osd_do_bio(osd, inode, iobuf); /* do IO stats for preparation reads */ @@ -1106,7 +1119,8 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt, } else if (iobuf->dr_npages > 0) { rc = osd_ldiskfs_map_inode_pages(inode, iobuf->dr_pages, iobuf->dr_npages, - iobuf->dr_blocks, 1); + iobuf->dr_blocks, + WRITE_COMMIT_OP); } else { /* no pages to write, no transno is needed */ thandle->th_local = 1; @@ -1201,10 +1215,13 @@ static int osd_read_prep(const struct lu_env *env, struct dt_object *dt, cache_hits + cache_misses); if (iobuf->dr_npages) { - rc = osd_ldiskfs_map_inode_pages(inode, iobuf->dr_pages, - iobuf->dr_npages, - iobuf->dr_blocks, 0); - rc = osd_do_bio(osd, inode, iobuf); + rc = osd_ldiskfs_map_inode_pages( + inode, iobuf->dr_pages, iobuf->dr_npages, + iobuf->dr_blocks, READ_OP); + if (likely(rc == 0)) + rc = osd_do_bio(osd, inode, iobuf); + else + osd_fini_iobuf(osd, iobuf); /* IO stats will be done in osd_bufs_put() */ } @@ -1644,8 +1661,19 @@ static int osd_punch(const struct lu_env *env, struct dt_object *dt, tid = oh->ot_handle->h_transaction->t_tid; + if (LDISKFS_TEST_OST_SNAPSHOT_ORIG(inode)) { + rc = ldiskfs_snapshot_punch(ldiskfs_journal_current_handle(), + inode, start, end); + if (rc) { + CERROR("fail to snapshot punch inode=%lu\n", + 
inode->i_ino); + SNAPSHOT_CONSOLE_ERR(rc); + RETURN(rc); + } + } i_size_write(inode, start); ll_truncate_pagecache(inode, start); + #ifdef HAVE_INODEOPS_TRUNCATE if (inode->i_op->truncate) { inode->i_op->truncate(inode); diff --git a/lustre/osd-ldiskfs/osd_lproc.c b/lustre/osd-ldiskfs/osd_lproc.c index 98e857a..690a7bc 100644 --- a/lustre/osd-ldiskfs/osd_lproc.c +++ b/lustre/osd-ldiskfs/osd_lproc.c @@ -45,6 +45,8 @@ #include "osd_internal.h" +#include + #ifdef LPROCFS void osd_brw_stats_update(struct osd_device *osd, struct osd_iobuf *iobuf) @@ -403,6 +405,30 @@ ldiskfs_osd_auto_scrub_seq_write(struct file *file, const char *buffer, LPROC_SEQ_FOPS(ldiskfs_osd_auto_scrub); static int +ldiskfs_osd_snapshot_lock_timeout_seq_show(struct seq_file *m, void *data) +{ + return seq_printf(m, "%d\n", ldiskfs_get_snapshot_lock_timeout()); +} + +static ssize_t +ldiskfs_osd_snapshot_lock_timeout_seq_write(struct file *file, + const char *buffer, + size_t count, loff_t *off) +{ + int snap_tout; + int rc; + + rc = lprocfs_write_helper(buffer, count, &snap_tout); + if (rc != 0) + return rc; + + ldiskfs_set_snapshot_lock_timeout(snap_tout); + + return count; +} +LPROC_SEQ_FOPS(ldiskfs_osd_snapshot_lock_timeout); + +static int ldiskfs_osd_track_declares_assert_seq_show(struct seq_file *m, void *data) { return seq_printf(m, "%d\n", ldiskfs_track_declares_assert); @@ -553,6 +579,8 @@ struct lprocfs_seq_vars lprocfs_osd_obd_vars[] = { struct lprocfs_seq_vars lprocfs_osd_module_vars[] = { { .name = "track_declares_assert", .fops = &ldiskfs_osd_track_declares_assert_fops }, + { .name = "snapshot_lock_timeout", + .fops = &ldiskfs_osd_snapshot_lock_timeout_fops }, { 0 } }; diff --git a/lustre/osp/osp_object.c b/lustre/osp/osp_object.c index 690e1ce..f993b25 100644 --- a/lustre/osp/osp_object.c +++ b/lustre/osp/osp_object.c @@ -1513,6 +1513,83 @@ static int osp_index_try(const struct lu_env *env, return 0; } +/* + * osp_snapshot_clone() + * + * \param[in] env environment + * \param[in] 
snap_dt snapshot object + * \param[in] attr snapshot attributes + * \param[in] orig_fid original fid + * \param[in] mdt_fid snapshot fid on mdt + * + * \retval 0 success + * \retval not 0 errcode + */ +static int osp_snapshot_clone(const struct lu_env *env, + struct dt_object *snap_dt, + struct lu_attr *attr, + const struct lu_fid *orig_fid, + const struct lu_fid *mdt_fid) +{ + struct osp_device *d = lu2osp_dev(snap_dt->do_lu.lo_dev); + struct obd_import *imp = d->opd_obd->u.cli.cl_import; + struct ptlrpc_request *req = NULL; + struct ost_body *body; + struct lu_ucred *uc = lu_ucred(env); + int rc; + ENTRY; + + /* alloc req */ + req = ptlrpc_request_alloc(imp, &RQF_OST_SNAPSHOT); + if (req == NULL) + GOTO(out, rc = -ENOMEM); + + /* pack req */ + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SNAPSHOT); + if (rc) { + ptlrpc_request_free(req); + req = NULL; + GOTO(out, rc); + } + req->rq_request_portal = OST_IO_PORTAL; + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + memset(body, 0x0, sizeof(*body)); + + fid_to_ostid(lu_object_fid(&snap_dt->do_lu), &body->oa.o_oi); + body->oa.o_valid |= (OBD_MD_FLID | OBD_MD_FLGROUP); + + /* flag of inode on MDS is not set to OST */ + obdo_from_la(&body->oa, attr, attr->la_valid & ~LA_FLAGS); + obdo_set_parent_fid(&body->oa, mdt_fid); + + body->oa.o_stripe_idx = d->opd_index; + + body->oa.o_snapshot_orig_seq = orig_fid->f_seq; + body->oa.o_snapshot_orig_oid = orig_fid->f_oid; + body->oa.o_snapshot_orig_ver = orig_fid->f_ver; + + body->oa.o_flags = (uc && (uc->uc_cap & CFS_CAP_SYS_RESOURCE_MASK)) + ? 
OBD_BRW_NOQUOTA : 0; + ptlrpc_request_set_replen(req); + ptlrpc_at_set_req_timeout(req); + + /* send req */ + rc = ptlrpc_queue_wait(req); +out: + if (rc) + CERROR("failed to create snapshot " + "FID="DFID" ost_idx=%d, ost_fid="DFID" rc=%d\n", + PFID(mdt_fid), d->opd_index, + PFID(lu_object_fid(&snap_dt->do_lu)), + rc); + + if (req) + ptlrpc_req_finished(req); + RETURN(rc); +} + struct dt_object_operations osp_obj_ops = { .do_declare_attr_get = osp_declare_attr_get, .do_attr_get = osp_attr_get, @@ -1527,6 +1604,7 @@ struct dt_object_operations osp_obj_ops = { .do_declare_destroy = osp_declare_object_destroy, .do_destroy = osp_object_destroy, .do_index_try = osp_index_try, + .do_osp_snapshot_clone = osp_snapshot_clone, }; static int osp_object_init(const struct lu_env *env, struct lu_object *o, diff --git a/lustre/ptlrpc/layout.c b/lustre/ptlrpc/layout.c index 4ecd9b5..b9f1db9 100644 --- a/lustre/ptlrpc/layout.c +++ b/lustre/ptlrpc/layout.c @@ -312,6 +312,23 @@ static const struct req_msg_field *mds_reint_setxattr_client[] = { &RMF_DLM_REQ }; +static const struct req_msg_field *mds_reint_snapshot_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_NAME, + &RMF_SNAP_EANAME, + &RMF_EADATA, + &RMF_EADATA2, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *mds_reint_snapshot_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_CAPA1, + &RMF_MDT_MD +}; + static const struct req_msg_field *mdt_swap_layouts[] = { &RMF_PTLRPC_BODY, &RMF_MDT_BODY, @@ -799,6 +816,8 @@ static struct req_format *req_formats[] = { &RQF_CONNECT, &RQF_LFSCK_NOTIFY, &RQF_LFSCK_QUERY, + &RQF_MDS_REINT_SNAPSHOT, + &RQF_OST_SNAPSHOT, }; struct req_msg_field { @@ -1063,6 +1082,14 @@ struct req_msg_field RMF_EADATA = DEFINE_MSGF("eadata", 0, -1, NULL, NULL); EXPORT_SYMBOL(RMF_EADATA); +struct req_msg_field RMF_EADATA2 = DEFINE_MSGF("eadata", 0, -1, + NULL, NULL); +EXPORT_SYMBOL(RMF_EADATA2); + +struct req_msg_field RMF_SNAP_EANAME = + DEFINE_MSGF("snap_eaname", RMF_F_STRING, -1, NULL, 
NULL); +EXPORT_SYMBOL(RMF_SNAP_EANAME); + struct req_msg_field RMF_EAVALS = DEFINE_MSGF("eavals", 0, -1, NULL, NULL); EXPORT_SYMBOL(RMF_EAVALS); @@ -1676,6 +1703,16 @@ struct req_format RQF_LFSCK_QUERY = DEFINE_REQ_FMT0("LFSCK_QUERY", obd_lfsck_request, obd_lfsck_reply); EXPORT_SYMBOL(RQF_LFSCK_QUERY); +struct req_format RQF_MDS_REINT_SNAPSHOT = + DEFINE_REQ_FMT0("MDS_REINT_SNAPSHOT", mds_reint_snapshot_client, + mds_reint_snapshot_server); +EXPORT_SYMBOL(RQF_MDS_REINT_SNAPSHOT); + +struct req_format RQF_OST_SNAPSHOT = + DEFINE_REQ_FMT0("OST_SNAPSHOT", ost_body_only, + ost_body_only); +EXPORT_SYMBOL(RQF_OST_SNAPSHOT); + #if !defined(__REQ_LAYOUT_USER__) /* Convenience macro */ diff --git a/lustre/ptlrpc/lproc_ptlrpc.c b/lustre/ptlrpc/lproc_ptlrpc.c index 84b0460..8b644af 100644 --- a/lustre/ptlrpc/lproc_ptlrpc.c +++ b/lustre/ptlrpc/lproc_ptlrpc.c @@ -73,6 +73,18 @@ struct ll_rpc_opcode { { OST_QUOTACHECK, "ost_quotacheck" }, { OST_QUOTACTL, "ost_quotactl" }, { OST_QUOTA_ADJUST_QUNIT, "ost_quota_adjust_qunit" }, + { 21, NULL }, + { 22, NULL }, + { 23, NULL }, + { 24, NULL }, + { 25, NULL }, + { 26, NULL }, + { 27, NULL }, + { 28, NULL }, + { 29, NULL }, + { 30, NULL }, + { 31, NULL }, + { OST_SNAPSHOT, "ost_snapshot" }, { MDS_GETATTR, "mds_getattr" }, { MDS_GETATTR_NAME, "mds_getattr_lock" }, { MDS_CLOSE, "mds_close" }, diff --git a/lustre/ptlrpc/pack_generic.c b/lustre/ptlrpc/pack_generic.c index 7768e41..4c7fa52 100644 --- a/lustre/ptlrpc/pack_generic.c +++ b/lustre/ptlrpc/pack_generic.c @@ -2092,6 +2092,10 @@ void lustre_swab_mdt_rec_reint (struct mdt_rec_reint *rr) __swab32s(&rr->rr_flags); __swab32s(&rr->rr_flags_h); __swab32s(&rr->rr_umask); + __swab32s(&rr->rr_padding_4); /* see below */ + /* rr_padding_4 is used for mdt_rec_snapshot_create::sc_flags + this swab function handles both mdt_rec_reint and + mdt_rec_snapshot_create */ CLASSERT(offsetof(typeof(*rr), rr_padding_4) != 0); }; diff --git a/lustre/ptlrpc/wiretest.c b/lustre/ptlrpc/wiretest.c index
62ddffa..53b7dc8 100644 --- a/lustre/ptlrpc/wiretest.c +++ b/lustre/ptlrpc/wiretest.c @@ -110,7 +110,9 @@ void lustre_assert_wire_constants(void) (long long)OST_QUOTACTL); LASSERTF(OST_QUOTA_ADJUST_QUNIT == 20, "found %lld\n", (long long)OST_QUOTA_ADJUST_QUNIT); - LASSERTF(OST_LAST_OPC == 21, "found %lld\n", + LASSERTF(OST_SNAPSHOT == 32, "found %lld\n", + (long long)OST_SNAPSHOT); + LASSERTF(OST_LAST_OPC == 33, "found %lld\n", (long long)OST_LAST_OPC); LASSERTF(OBD_OBJECT_EOF == 0xffffffffffffffffULL, "found 0x%.16llxULL\n", OBD_OBJECT_EOF); @@ -202,7 +204,9 @@ void lustre_assert_wire_constants(void) (long long)REINT_RMENTRY); LASSERTF(REINT_MIGRATE == 9, "found %lld\n", (long long)REINT_MIGRATE); - LASSERTF(REINT_MAX == 10, "found %lld\n", + LASSERTF(REINT_SNAPSHOT == 21, "found %lld\n", + (long long)REINT_SNAPSHOT); + LASSERTF(REINT_MAX == 22, "found %lld\n", (long long)REINT_MAX); LASSERTF(DISP_IT_EXECD == 0x00000001UL, "found 0x%.8xUL\n", (unsigned)DISP_IT_EXECD); diff --git a/lustre/target/tgt_handler.c b/lustre/target/tgt_handler.c index af9cb58..6fdf048 100644 --- a/lustre/target/tgt_handler.c +++ b/lustre/target/tgt_handler.c @@ -471,6 +471,7 @@ static int tgt_filter_recovery_request(struct ptlrpc_request *req, case OST_SETATTR: case OST_SYNC: case OST_WRITE: + case OST_SNAPSHOT: *process = target_queue_recovery_request(req, obd); RETURN(0); @@ -1650,12 +1651,13 @@ int tgt_brw_read(struct tgt_session_info *tsi) struct niobuf_remote *remote_nb; struct niobuf_local *local_nb; struct obd_ioobj *ioo; - struct ost_body *body, *repbody; + struct ost_body *body, *repbody = NULL; struct l_wait_info lwi; struct lustre_handle lockh = { 0 }; int niocount, npages, nob = 0, rc, i; int no_reply = 0; struct tgt_thread_big_cache *tbc = req->rq_svc_thread->t_data; + void *snap_lock = NULL; ENTRY; @@ -1726,11 +1728,22 @@ int tgt_brw_read(struct tgt_session_info *tsi) repbody = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); repbody->oa = body->oa; + /* if target is not
snapshot_orig, + * obd_snapshot_lock() doesn't hold snapshot_lock + * and snap_lock is NULL */ + rc = obd_snapshot_lock(tsi->tsi_env, exp, + &repbody->oa, false, &snap_lock); + if (rc) { + CERROR("fail to lock snapshot. err=%d\n", + rc); + GOTO(out_lock, rc); + } + npages = PTLRPC_MAX_BRW_PAGES; rc = obd_preprw(tsi->tsi_env, OBD_BRW_READ, exp, &repbody->oa, 1, ioo, remote_nb, &npages, local_nb, NULL, BYPASS_CAPA); if (rc != 0) - GOTO(out_lock, rc); + GOTO(out_snap_unlock, rc); desc = ptlrpc_prep_bulk_exp(req, npages, ioobj_max_brw_get(ioo), BULK_PUT_SOURCE, OST_BULK_PORTAL); @@ -1792,6 +1805,10 @@ out_commitrw: NULL, rc); if (rc == 0) tgt_drop_id(exp, &repbody->oa); +out_snap_unlock: + if (snap_lock) + obd_snapshot_unlock(tsi->tsi_env, exp, + &repbody->oa, snap_lock); out_lock: tgt_brw_unlock(ioo, remote_nb, &lockh, LCK_PR); @@ -1884,15 +1901,16 @@ int tgt_brw_write(struct tgt_session_info *tsi) struct niobuf_remote *remote_nb; struct niobuf_local *local_nb; struct obd_ioobj *ioo; - struct ost_body *body, *repbody; + struct ost_body *body, *repbody = NULL; struct l_wait_info lwi; struct lustre_handle lockh = {0}; __u32 *rcs; int objcount, niocount, npages; int rc, i, j; - cksum_type_t cksum_type = OBD_CKSUM_CRC32; + cksum_type_t cksum_type = OBD_CKSUM_CRC32; bool no_reply = false, mmap; struct tgt_thread_big_cache *tbc = req->rq_svc_thread->t_data; + void *snap_lock = NULL; ENTRY; @@ -1994,6 +2012,16 @@ int tgt_brw_write(struct tgt_session_info *tsi) GOTO(out_lock, rc = -ENOMEM); repbody->oa = body->oa; + /* if target is not snapshot_orig, + * obd_snapshot_lock() doesn't hold snapshot_lock + * and snap_lock is NULL */ + rc = obd_snapshot_lock(tsi->tsi_env, exp, + &repbody->oa, false, &snap_lock); + if (rc) { + CERROR("fail to lock snapshot.
err=%d\n", rc); + GOTO(out_lock, rc); + } + npages = PTLRPC_MAX_BRW_PAGES; rc = obd_preprw(tsi->tsi_env, OBD_BRW_WRITE, exp, &repbody->oa, objcount, ioo, remote_nb, &npages, local_nb, NULL, @@ -2090,6 +2118,9 @@ skip_transfer: tgt_drop_id(exp, &repbody->oa); } out_lock: + if (snap_lock) + obd_snapshot_unlock(tsi->tsi_env, exp, + &repbody->oa, snap_lock); tgt_brw_unlock(ioo, remote_nb, &lockh, LCK_PW); if (desc) ptlrpc_free_bulk_nopin(desc); diff --git a/lustre/utils/Makefile.am b/lustre/utils/Makefile.am index da0213f..6bf35b3 100644 --- a/lustre/utils/Makefile.am +++ b/lustre/utils/Makefile.am @@ -48,7 +48,7 @@ lctl_SOURCES = lustre_lfsck.c obd.c lustre_cfg.c lctl.c obdctl.h lctl_LDADD := liblustreapi.a $(LIBPTLCTL) $(PTHREAD_LIBS) $(LIBREADLINE) lctl_DEPENDENCIES := $(LIBPTLCTL) liblustreapi.a -lfs_SOURCES = lfs.c +lfs_SOURCES = lfs.c lfs_snapshot.c lfs_snapshot.h lfs_LDADD := liblustreapi.a $(LIBPTLCTL) $(LIBREADLINE) lfs_DEPENDENCIES := $(LIBPTLCTL) liblustreapi.a diff --git a/lustre/utils/lctl.c b/lustre/utils/lctl.c index 261da54..23ccb3d 100644 --- a/lustre/utils/lctl.c +++ b/lustre/utils/lctl.c @@ -448,6 +448,17 @@ command_t cmdlist[] = { {"set_route", jt_ptl_notify_router, 0, "enable/disable routes via gateway in the portals routing table\n" "usage: set_route [