diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2012-12-12 09:19:45 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-12-12 09:19:45 -0800 |
commit | 3f1c64f410e4394ecefadd7a597a7c20368a65fc (patch) | |
tree | 10f15d6a222b15a34831f2d7d1e3ac26f1436638 /fs | |
parent | 22a40fd9a60388aec8106b0baffc8f59f83bb1b4 (diff) | |
parent | f9668a09e32ac6d2aa22f44cc310e430a8f4a40f (diff) |
Merge tag 'for-linus-v3.8-rc1' of git://oss.sgi.com/xfs/xfs
Pull xfs update from Ben Myers:
"There is plenty going on, including the cleanup of xfssyncd, metadata
verifiers, CRC infrastructure for the log, tracking of inodes with
speculative allocation, a cleanup of xfs_fs_subr.c, fixes for
XFS_IOC_ZERO_RANGE, and important fix related to log replay (only
update the last_sync_lsn when a transaction completes), a fix for
deadlock on AGF buffers, documentation and comment updates, and a few
more cleanups and fixes.
Details:
- remove the xfssyncd mess
- only update the last_sync_lsn when a transaction completes
- zero allocation_args on the kernel stack
- fix AGF/alloc workqueue deadlock
- silence uninitialised f.file warning
- Update inode alloc comments
- Update mount options documentation
- report projid32bit feature in geometry call
- speculative preallocation inode tracking
- fix attr tree double split corruption
- fix broken error handling in xfs_vm_writepage
- drop buffer io reference when a bad bio is built
- add more attribute tree trace points
- growfs infrastructure changes for 3.8
- fs/xfs/xfs_fs_subr.c die die die
- add CRC infrastructure
- add CRC checks to the log
- Remove description of nodelaylog mount option from xfs.txt
- inode allocation should use unmapped buffers
- byte range granularity for XFS_IOC_ZERO_RANGE
- fix direct IO nested transaction deadlock
- fix stray dquot unlock when reclaiming dquots
- fix sparse reported log CRC endian issue"
Fix up trivial conflict in fs/xfs/xfs_fsops.c due to the same patch
having been applied twice (commits eaef854335ce and 1375cb65e87b: "xfs:
growfs: don't read garbage for new secondary superblocks") with later
updates to the affected code in the XFS tree.
* tag 'for-linus-v3.8-rc1' of git://oss.sgi.com/xfs/xfs: (78 commits)
xfs: fix sparse reported log CRC endian issue
xfs: fix stray dquot unlock when reclaiming dquots
xfs: fix direct IO nested transaction deadlock.
xfs: byte range granularity for XFS_IOC_ZERO_RANGE
xfs: inode allocation should use unmapped buffers.
xfs: Remove the description of nodelaylog mount option from xfs.txt
xfs: add CRC checks to the log
xfs: add CRC infrastructure
xfs: convert buffer verifiers to an ops structure.
xfs: connect up write verifiers to new buffers
xfs: add pre-write metadata buffer verifier callbacks
xfs: add buffer pre-write callback
xfs: Add verifiers to dir2 data readahead.
xfs: add xfs_da_node verification
xfs: factor and verify attr leaf reads
xfs: factor dir2 leaf read
xfs: factor out dir2 data block reading
xfs: factor dir2 free block reading
xfs: verify dir2 block format buffers
xfs: factor dir2 block read operations
...
Diffstat (limited to 'fs')
69 files changed, 3737 insertions, 2308 deletions
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 6100ec0fa1d4..5a7ffe54f5d5 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -2,6 +2,7 @@ config XFS_FS tristate "XFS filesystem support" depends on BLOCK select EXPORTFS + select LIBCRC32C help XFS is a high performance journaling filesystem which originated on the SGI IRIX platform. It is completely multi-threaded, can diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index d2bf974b1a2f..d02201df855b 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -37,9 +37,8 @@ xfs-y += xfs_aops.o \ xfs_file.o \ xfs_filestream.o \ xfs_fsops.o \ - xfs_fs_subr.o \ xfs_globals.o \ - xfs_iget.o \ + xfs_icache.o \ xfs_ioctl.o \ xfs_iomap.o \ xfs_iops.o \ @@ -47,7 +46,6 @@ xfs-y += xfs_aops.o \ xfs_message.o \ xfs_mru_cache.o \ xfs_super.o \ - xfs_sync.o \ xfs_xattr.o \ xfs_rename.o \ xfs_utils.o \ diff --git a/fs/xfs/uuid.h b/fs/xfs/uuid.h index 4732d71262cc..104db0f3bed6 100644 --- a/fs/xfs/uuid.h +++ b/fs/xfs/uuid.h @@ -26,4 +26,10 @@ extern int uuid_is_nil(uuid_t *uuid); extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2); extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]); +static inline void +uuid_copy(uuid_t *dst, uuid_t *src) +{ + memcpy(dst, src, sizeof(uuid_t)); +} + #endif /* __XFS_SUPPORT_UUID_H__ */ diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index 44d65c1533c0..f2aeedb6a579 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -108,6 +108,8 @@ typedef struct xfs_agf { extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); +extern const struct xfs_buf_ops xfs_agf_buf_ops; + /* * Size of the unlinked inode hash table in the agi. */ @@ -161,6 +163,8 @@ typedef struct xfs_agi { extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, struct xfs_buf **bpp); +extern const struct xfs_buf_ops xfs_agi_buf_ops; + /* * The third a.g. block contains the a.g. freelist, an array * of block pointers to blocks owned by the allocation btree code. @@ -233,6 +237,7 @@ typedef struct xfs_perag { #define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup in xfs_inode_ag_iterator */ #define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */ +#define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */ #define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels) #define XFS_MIN_FREELIST_RAW(bl,cl,mp) \ diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 335206a9c698..393055fe3aef 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -430,6 +430,60 @@ xfs_alloc_fixup_trees( return 0; } +static void +xfs_agfl_verify( + struct xfs_buf *bp) +{ +#ifdef WHEN_CRCS_COME_ALONG + /* + * we cannot actually do any verification of the AGFL because mkfs does + * not initialise the AGFL to zero or NULL. Hence the only valid part of + * the AGFL is what the AGF says is active. We can't get to the AGF, so + * we can't verify just those entries are valid. + * + * This problem goes away when the CRC format change comes along as that + * requires the AGFL to be initialised by mkfs. At that point, we can + * verify the blocks in the agfl -active or not- lie within the bounds + * of the AG. Until then, just leave this check ifdef'd out. + */ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp); + int agfl_ok = 1; + + int i; + + for (i = 0; i < XFS_AGFL_SIZE(mp); i++) { + if (be32_to_cpu(agfl->agfl_bno[i]) == NULLAGBLOCK || + be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks) + agfl_ok = 0; + } + + if (!agfl_ok) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agfl); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } +#endif +} + +static void +xfs_agfl_write_verify( + struct xfs_buf *bp) +{ + xfs_agfl_verify(bp); +} + +static void +xfs_agfl_read_verify( + struct xfs_buf *bp) +{ + xfs_agfl_verify(bp); +} + +const struct xfs_buf_ops xfs_agfl_buf_ops = { + .verify_read = xfs_agfl_read_verify, + .verify_write = xfs_agfl_write_verify, +}; + /* * Read in the allocation group free block array. */ @@ -447,7 +501,7 @@ xfs_alloc_read_agfl( error = xfs_trans_read_buf( mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, &bp); + XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops); if (error) return error; ASSERT(!xfs_buf_geterror(bp)); @@ -2091,6 +2145,63 @@ xfs_alloc_put_freelist( return 0; } +static void +xfs_agf_verify( + struct xfs_buf *bp) + { + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_agf *agf; + int agf_ok; + + agf = XFS_BUF_TO_AGF(bp); + + agf_ok = agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && + XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && + be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && + be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) && + be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) && + be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp); + + /* + * during growfs operations, the perag is not fully initialised, + * so we can't use it for any useful checking. growfs ensures we can't + * use it by using uncached buffers that don't have the perag attached + * so we can detect and avoid this problem. + */ + if (bp->b_pag) + agf_ok = agf_ok && be32_to_cpu(agf->agf_seqno) == + bp->b_pag->pag_agno; + + if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <= + be32_to_cpu(agf->agf_length); + + if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF, + XFS_RANDOM_ALLOC_READ_AGF))) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agf); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } +} + +static void +xfs_agf_read_verify( + struct xfs_buf *bp) +{ + xfs_agf_verify(bp); +} + +static void +xfs_agf_write_verify( + struct xfs_buf *bp) +{ + xfs_agf_verify(bp); +} + +const struct xfs_buf_ops xfs_agf_buf_ops = { + .verify_read = xfs_agf_read_verify, + .verify_write = xfs_agf_write_verify, +}; + /* * Read in the allocation group header (free/alloc section). */ @@ -2102,44 +2213,19 @@ xfs_read_agf( int flags, /* XFS_BUF_ */ struct xfs_buf **bpp) /* buffer for the ag freelist header */ { - struct xfs_agf *agf; /* ag freelist header */ - int agf_ok; /* set if agf is consistent */ int error; ASSERT(agno != NULLAGNUMBER); error = xfs_trans_read_buf( mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), flags, bpp); + XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops); if (error) return error; if (!*bpp) return 0; ASSERT(!(*bpp)->b_error); - agf = XFS_BUF_TO_AGF(*bpp); - - /* - * Validate the magic number of the agf block. - */ - agf_ok = - agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && - XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && - be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && - be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) && - be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) && - be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp) && - be32_to_cpu(agf->agf_seqno) == agno; - if (xfs_sb_version_haslazysbcount(&mp->m_sb)) - agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <= - be32_to_cpu(agf->agf_length); - if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF, - XFS_RANDOM_ALLOC_READ_AGF))) { - XFS_CORRUPTION_ERROR("xfs_alloc_read_agf", - XFS_ERRLEVEL_LOW, mp, agf); - xfs_trans_brelse(tp, *bpp); - return XFS_ERROR(EFSCORRUPTED); - } xfs_buf_set_ref(*bpp, XFS_AGF_REF); return 0; } diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index feacb061bab7..99d0a6101558 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -231,4 +231,7 @@ xfs_alloc_get_rec( xfs_extlen_t *len, /* output: length of extent */ int *stat); /* output: success/failure */ +extern const struct xfs_buf_ops xfs_agf_buf_ops; +extern const struct xfs_buf_ops xfs_agfl_buf_ops; + #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index f7876c6d6165..b1ddef6b2689 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -272,6 +272,82 @@ xfs_allocbt_key_diff( return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; } +static void +xfs_allocbt_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_perag *pag = bp->b_pag; + unsigned int level; + int sblock_ok; /* block passes checks */ + + /* + * magic number and level verification + * + * During growfs operations, we can't verify the exact level as the + * perag is not fully initialised and hence not attached to the buffer. + * In this case, check against the maximum tree depth. + */ + level = be16_to_cpu(block->bb_level); + switch (block->bb_magic) { + case cpu_to_be32(XFS_ABTB_MAGIC): + if (pag) + sblock_ok = level < pag->pagf_levels[XFS_BTNUM_BNOi]; + else + sblock_ok = level < mp->m_ag_maxlevels; + break; + case cpu_to_be32(XFS_ABTC_MAGIC): + if (pag) + sblock_ok = level < pag->pagf_levels[XFS_BTNUM_CNTi]; + else + sblock_ok = level < mp->m_ag_maxlevels; + break; + default: + sblock_ok = 0; + break; + } + + /* numrecs verification */ + sblock_ok = sblock_ok && + be16_to_cpu(block->bb_numrecs) <= mp->m_alloc_mxr[level != 0]; + + /* sibling pointer verification */ + sblock_ok = sblock_ok && + (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) || + be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) && + block->bb_u.s.bb_leftsib && + (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) || + be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) && + block->bb_u.s.bb_rightsib; + + if (!sblock_ok) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } +} + +static void +xfs_allocbt_read_verify( + struct xfs_buf *bp) +{ + xfs_allocbt_verify(bp); +} + +static void +xfs_allocbt_write_verify( + struct xfs_buf *bp) +{ + xfs_allocbt_verify(bp); +} + +const struct xfs_buf_ops xfs_allocbt_buf_ops = { + .verify_read = xfs_allocbt_read_verify, + .verify_write = xfs_allocbt_write_verify, +}; + + #ifdef DEBUG STATIC int xfs_allocbt_keys_inorder( @@ -327,6 +403,7 @@ static const struct xfs_btree_ops xfs_allocbt_ops = { .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, .key_diff = xfs_allocbt_key_diff, + .buf_ops = &xfs_allocbt_buf_ops, #ifdef DEBUG .keys_inorder = xfs_allocbt_keys_inorder, .recs_inorder = xfs_allocbt_recs_inorder, diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h index 359fb86ed876..7e89a2b429dd 100644 --- a/fs/xfs/xfs_alloc_btree.h +++ b/fs/xfs/xfs_alloc_btree.h @@ -93,4 +93,6 @@ extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *, xfs_agnumber_t, xfs_btnum_t); extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int); +extern const struct xfs_buf_ops xfs_allocbt_buf_ops; + #endif /* __XFS_ALLOC_BTREE_H__ */ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index e57e2daa357c..4111a40ebe1a 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -124,7 +124,7 @@ xfs_setfilesize_trans_alloc( ioend->io_append_trans = tp; /* - * We will pass freeze protection with a transaction. So tell lockdep + * We may pass freeze protection with a transaction. So tell lockdep * we released it. */ rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], @@ -149,11 +149,13 @@ xfs_setfilesize( xfs_fsize_t isize; /* - * The transaction was allocated in the I/O submission thread, - * thus we need to mark ourselves as beeing in a transaction - * manually. + * The transaction may have been allocated in the I/O submission thread, + * thus we need to mark ourselves as beeing in a transaction manually. + * Similarly for freeze protection. */ current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); + rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], + 0, 1, _THIS_IP_); xfs_ilock(ip, XFS_ILOCK_EXCL); isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size); @@ -187,7 +189,8 @@ xfs_finish_ioend( if (ioend->io_type == XFS_IO_UNWRITTEN) queue_work(mp->m_unwritten_workqueue, &ioend->io_work); - else if (ioend->io_append_trans) + else if (ioend->io_append_trans || + (ioend->io_isdirect && xfs_ioend_is_append(ioend))) queue_work(mp->m_data_workqueue, &ioend->io_work); else xfs_destroy_ioend(ioend); @@ -205,15 +208,6 @@ xfs_end_io( struct xfs_inode *ip = XFS_I(ioend->io_inode); int error = 0; - if (ioend->io_append_trans) { - /* - * We've got freeze protection passed with the transaction. - * Tell lockdep about it. - */ - rwsem_acquire_read( - &ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], - 0, 1, _THIS_IP_); - } if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { ioend->io_error = -EIO; goto done; @@ -226,35 +220,31 @@ xfs_end_io( * range to normal written extens after the data I/O has finished. */ if (ioend->io_type == XFS_IO_UNWRITTEN) { + error = xfs_iomap_write_unwritten(ip, ioend->io_offset, + ioend->io_size); + } else if (ioend->io_isdirect && xfs_ioend_is_append(ioend)) { /* - * For buffered I/O we never preallocate a transaction when - * doing the unwritten extent conversion, but for direct I/O - * we do not know if we are converting an unwritten extent - * or not at the point where we preallocate the transaction. + * For direct I/O we do not know if we need to allocate blocks + * or not so we can't preallocate an append transaction as that + * results in nested reservations and log space deadlocks. Hence + * allocate the transaction here. While this is sub-optimal and + * can block IO completion for some time, we're stuck with doing + * it this way until we can pass the ioend to the direct IO + * allocation callbacks and avoid nesting that way. */ - if (ioend->io_append_trans) { - ASSERT(ioend->io_isdirect); - - current_set_flags_nested( - &ioend->io_append_trans->t_pflags, PF_FSTRANS); - xfs_trans_cancel(ioend->io_append_trans, 0); - } - - error = xfs_iomap_write_unwritten(ip, ioend->io_offset, - ioend->io_size); - if (error) { - ioend->io_error = -error; + error = xfs_setfilesize_trans_alloc(ioend); + if (error) goto done; - } + error = xfs_setfilesize(ioend); } else if (ioend->io_append_trans) { error = xfs_setfilesize(ioend); - if (error) - ioend->io_error = -error; } else { ASSERT(!xfs_ioend_is_append(ioend)); } done: + if (error) + ioend->io_error = -error; xfs_destroy_ioend(ioend); } @@ -1432,25 +1422,21 @@ xfs_vm_direct_IO( size_t size = iov_length(iov, nr_segs); /* - * We need to preallocate a transaction for a size update - * here. In the case that this write both updates the size - * and converts at least on unwritten extent we will cancel - * the still clean transaction after the I/O has finished. + * We cannot preallocate a size update transaction here as we + * don't know whether allocation is necessary or not. Hence we + * can only tell IO completion that one is necessary if we are + * not doing unwritten extent conversion. */ iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT); - if (offset + size > XFS_I(inode)->i_d.di_size) { - ret = xfs_setfilesize_trans_alloc(ioend); - if (ret) - goto out_destroy_ioend; + if (offset + size > XFS_I(inode)->i_d.di_size) ioend->io_isdirect = 1; - } ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, nr_segs, xfs_get_blocks_direct, xfs_end_io_direct_write, NULL, 0); if (ret != -EIOCBQUEUED && iocb->private) - goto out_trans_cancel; + goto out_ |