diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-04-10 09:54:26 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-04-10 09:54:26 -0700 |
commit | 8c3c07439e1d793d8803a404df86e487875f3340 (patch) | |
tree | 605809ff7d40f3260fb462b09bb8ee2e5b39f901 /fs | |
parent | d3e5e977abe2b6c98455e4f6acd3b09cb4a0a940 (diff) | |
parent | 5833112df7e9a306af9af09c60127b92ed723962 (diff) |
Merge tag 'xfs-5.7-merge-12' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux
Pull more xfs updates from Darrick Wong:
"As promised last week, this batch changes how xfs interacts with
memory reclaim; how the log batches and throttles log items; how hard
writes near ENOSPC will try to squeeze more space out of the
filesystem; and hopefully fix the last of the umount hangs after a
catastrophic failure.
Summary:
- Validate the realtime geometry in the superblock when mounting
- Refactor a bunch of tricky flag handling in the log code
- Flush the CIL more judiciously so that we don't wait until there
are millions of log items consuming a lot of memory.
- Throttle transaction commits to prevent the xfs frontend from
flooding the CIL with too many log items.
- Account metadata buffers correctly for memory reclaim.
- Mark slabs properly for memory reclaim. These should help reclaim
run more effectively when XFS is using a lot of memory.
- Don't write a garbage log record at unmount time if we're trying to
trigger summary counter recalculation at next mount.
- Don't block the AIL on locked dquot/inode buffers; instead trigger
its backoff mechanism to give the lock holder a chance to finish
up.
- Ratelimit writeback flushing when buffered writes encounter ENOSPC.
- Other minor cleanups.
- Make reflink a synchronous operation when the fs is mounted with
wsync or sync, which means that now we force the log to disk to
record the changes"
* tag 'xfs-5.7-merge-12' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: (26 commits)
xfs: reflink should force the log out if mounted with wsync
xfs: factor out a new xfs_log_force_inode helper
xfs: fix inode number overflow in ifree cluster helper
xfs: remove redundant variable assignment in xfs_symlink()
xfs: ratelimit inode flush on buffered write ENOSPC
xfs: return locked status of inode buffer on xfsaild push
xfs: trylock underlying buffer on dquot flush
xfs: remove unnecessary ternary from xfs_create
xfs: don't write a corrupt unmount record to force summary counter recalc
xfs: factor inode lookup from xfs_ifree_cluster
xfs: tail updates only need to occur when LSN changes
xfs: factor common AIL item deletion code
xfs: correctly acount for reclaimable slabs
xfs: Improve metadata buffer reclaim accountability
xfs: don't allow log IO to be throttled
xfs: Throttle commits on delayed background CIL push
xfs: Lower CIL flush limit for large logs
xfs: remove some stale comments from the log code
xfs: refactor unmount record writing
xfs: merge xlog_commit_record with xlog_write_done
...
Diffstat (limited to 'fs')
-rw-r--r-- | fs/xfs/libxfs/xfs_sb.c | 32 | ||||
-rw-r--r-- | fs/xfs/xfs_buf.c | 11 | ||||
-rw-r--r-- | fs/xfs/xfs_dquot.c | 6 | ||||
-rw-r--r-- | fs/xfs/xfs_dquot_item.c | 3 | ||||
-rw-r--r-- | fs/xfs/xfs_export.c | 14 | ||||
-rw-r--r-- | fs/xfs/xfs_file.c | 16 | ||||
-rw-r--r-- | fs/xfs/xfs_inode.c | 174 | ||||
-rw-r--r-- | fs/xfs/xfs_inode.h | 1 | ||||
-rw-r--r-- | fs/xfs/xfs_inode_item.c | 31 | ||||
-rw-r--r-- | fs/xfs/xfs_log.c | 372 | ||||
-rw-r--r-- | fs/xfs/xfs_log.h | 4 | ||||
-rw-r--r-- | fs/xfs/xfs_log_cil.c | 55 | ||||
-rw-r--r-- | fs/xfs/xfs_log_priv.h | 75 | ||||
-rw-r--r-- | fs/xfs/xfs_mount.h | 1 | ||||
-rw-r--r-- | fs/xfs/xfs_qm.c | 14 | ||||
-rw-r--r-- | fs/xfs/xfs_super.c | 17 | ||||
-rw-r--r-- | fs/xfs/xfs_symlink.c | 1 | ||||
-rw-r--r-- | fs/xfs/xfs_trace.h | 15 | ||||
-rw-r--r-- | fs/xfs/xfs_trans.c | 27 | ||||
-rw-r--r-- | fs/xfs/xfs_trans_ail.c | 88 | ||||
-rw-r--r-- | fs/xfs/xfs_trans_priv.h | 6 |
21 files changed, 512 insertions, 451 deletions
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 00266de58954..c526c5e5ab76 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -328,6 +328,38 @@ xfs_validate_sb_common( return -EFSCORRUPTED; } + /* Validate the realtime geometry; stolen from xfs_repair */ + if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE || + sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) { + xfs_notice(mp, + "realtime extent sanity check failed"); + return -EFSCORRUPTED; + } + + if (sbp->sb_rblocks == 0) { + if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 || + sbp->sb_rextslog != 0 || sbp->sb_frextents != 0) { + xfs_notice(mp, + "realtime zeroed geometry check failed"); + return -EFSCORRUPTED; + } + } else { + uint64_t rexts; + uint64_t rbmblocks; + + rexts = div_u64(sbp->sb_rblocks, sbp->sb_rextsize); + rbmblocks = howmany_64(sbp->sb_rextents, + NBBY * sbp->sb_blocksize); + + if (sbp->sb_rextents != rexts || + sbp->sb_rextslog != xfs_highbit32(sbp->sb_rextents) || + sbp->sb_rbmblocks != rbmblocks) { + xfs_notice(mp, + "realtime geometry sanity check failed"); + return -EFSCORRUPTED; + } + } + if (sbp->sb_unit) { if (!xfs_sb_version_hasdalign(sbp) || sbp->sb_unit > sbp->sb_width || diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index f880141a2268..9ec3eaf1c618 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -327,6 +327,9 @@ xfs_buf_free( __free_page(page); } + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += + bp->b_page_count; } else if (bp->b_flags & _XBF_KMEM) kmem_free(bp->b_addr); _xfs_buf_free_pages(bp); @@ -2114,9 +2117,11 @@ xfs_buf_delwri_pushbuf( int __init xfs_buf_init(void) { - xfs_buf_zone = kmem_cache_create("xfs_buf", - sizeof(struct xfs_buf), 0, - SLAB_HWCACHE_ALIGN, NULL); + xfs_buf_zone = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0, + SLAB_HWCACHE_ALIGN | + SLAB_RECLAIM_ACCOUNT | + SLAB_MEM_SPREAD, + NULL); if (!xfs_buf_zone) goto out; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 711376ca269f..af2c8e5ceea0 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -1105,8 +1105,8 @@ xfs_qm_dqflush( * Get the buffer containing the on-disk dquot */ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, 0, &bp, - &xfs_dquot_buf_ops); + mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK, + &bp, &xfs_dquot_buf_ops); if (error) goto out_unlock; @@ -1177,7 +1177,7 @@ xfs_qm_dqflush( out_unlock: xfs_dqfunlock(dqp); - return -EIO; + return error; } /* diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index cf65e2e43c6e..baad1748d0d1 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -189,7 +189,8 @@ xfs_qm_dquot_logitem_push( if (!xfs_buf_delwri_queue(bp, buffer_list)) rval = XFS_ITEM_FLUSHING; xfs_buf_relse(bp); - } + } else if (error == -EAGAIN) + rval = XFS_ITEM_LOCKED; spin_lock(&lip->li_ailp->ail_lock); out_unlock: diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index f1372f9046e3..5a4b0119143a 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -15,7 +15,6 @@ #include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_icache.h" -#include "xfs_log.h" #include "xfs_pnfs.h" /* @@ -221,18 +220,7 @@ STATIC int xfs_fs_nfs_commit_metadata( struct inode *inode) { - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - xfs_lsn_t lsn = 0; - - xfs_ilock(ip, XFS_ILOCK_SHARED); - if (xfs_ipincount(ip)) - lsn = ip->i_itemp->ili_last_lsn; - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - if (!lsn) - return 0; - return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); + return xfs_log_force_inode(XFS_I(inode)); } const struct export_operations xfs_export_operations = { diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index b8a4a3f29b36..4b8bdecc3863 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -80,19 +80,9 @@ xfs_dir_fsync( int datasync) { struct xfs_inode *ip = XFS_I(file->f_mapping->host); - struct xfs_mount *mp = ip->i_mount; - xfs_lsn_t lsn = 0; trace_xfs_dir_fsync(ip); - - xfs_ilock(ip, XFS_ILOCK_SHARED); - if (xfs_ipincount(ip)) - lsn = ip->i_itemp->ili_last_lsn; - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - if (!lsn) - return 0; - return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); + return xfs_log_force_inode(ip); } STATIC int @@ -1069,7 +1059,11 @@ xfs_file_remap_range( ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize, remap_flags); + if (ret) + goto out_unlock; + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_log_force_inode(dest); out_unlock: xfs_reflink_remap_unlock(file_in, file_out); if (ret) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 14b922f2a6db..d1772786af29 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1200,8 +1200,7 @@ xfs_create( unlock_dp_on_error = false; error = xfs_dir_createname(tp, dp, name, ip->i_ino, - resblks ? - resblks - XFS_IALLOC_SPACE_RES(mp) : 0); + resblks - XFS_IALLOC_SPACE_RES(mp)); if (error) { ASSERT(error != -ENOSPC); goto out_trans_cancel; @@ -2504,6 +2503,88 @@ out: } /* + * Look up the inode number specified and mark it stale if it is found. If it is + * dirty, return the inode so it can be attached to the cluster buffer so it can + * be processed appropriately when the cluster free transaction completes. + */ +static struct xfs_inode * +xfs_ifree_get_one_inode( + struct xfs_perag *pag, + struct xfs_inode *free_ip, + xfs_ino_t inum) +{ + struct xfs_mount *mp = pag->pag_mount; + struct xfs_inode *ip; + +retry: + rcu_read_lock(); + ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum)); + + /* Inode not in memory, nothing to do */ + if (!ip) + goto out_rcu_unlock; + + /* + * because this is an RCU protected lookup, we could find a recently + * freed or even reallocated inode during the lookup. We need to check + * under the i_flags_lock for a valid inode here. Skip it if it is not + * valid, the wrong inode or stale. + */ + spin_lock(&ip->i_flags_lock); + if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) { + spin_unlock(&ip->i_flags_lock); + goto out_rcu_unlock; + } + spin_unlock(&ip->i_flags_lock); + + /* + * Don't try to lock/unlock the current inode, but we _cannot_ skip the + * other inodes that we did not find in the list attached to the buffer + * and are not already marked stale. If we can't lock it, back off and + * retry. + */ + if (ip != free_ip) { + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { + rcu_read_unlock(); + delay(1); + goto retry; + } + + /* + * Check the inode number again in case we're racing with + * freeing in xfs_reclaim_inode(). See the comments in that + * function for more information as to why the initial check is + * not sufficient. + */ + if (ip->i_ino != inum) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + goto out_rcu_unlock; + } + } + rcu_read_unlock(); + + xfs_iflock(ip); + xfs_iflags_set(ip, XFS_ISTALE); + + /* + * We don't need to attach clean inodes or those only with unlogged + * changes (which we throw away, anyway). + */ + if (!ip->i_itemp || xfs_inode_clean(ip)) { + ASSERT(ip != free_ip); + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + goto out_no_inode; + } + return ip; + +out_rcu_unlock: + rcu_read_unlock(); +out_no_inode: + return NULL; +} + +/* * A big issue when freeing the inode cluster is that we _cannot_ skip any * inodes that are in memory - they all must be marked stale and attached to * the cluster buffer. @@ -2603,77 +2684,11 @@ xfs_ifree_cluster( * even trying to lock them. */ for (i = 0; i < igeo->inodes_per_cluster; i++) { -retry: - rcu_read_lock(); - ip = radix_tree_lookup(&pag->pag_ici_root, - XFS_INO_TO_AGINO(mp, (inum + i))); - - /* Inode not in memory, nothing to do */ - if (!ip) { - rcu_read_unlock(); + ip = xfs_ifree_get_one_inode(pag, free_ip, inum + i); + if (!ip) continue; - } - - /* - * because this is an RCU protected lookup, we could - * find a recently freed or even reallocated inode - * during the lookup. We need to check under the - * i_flags_lock for a valid inode here. Skip it if it - * is not valid, the wrong inode or stale. - */ - spin_lock(&ip->i_flags_lock); - if (ip->i_ino != inum + i || - __xfs_iflags_test(ip, XFS_ISTALE)) { - spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); - continue; - } - spin_unlock(&ip->i_flags_lock); - - /* - * Don't try to lock/unlock the current inode, but we - * _cannot_ skip the other inodes that we did not find - * in the list attached to the buffer and are not - * already marked stale. If we can't lock it, back off - * and retry. - */ - if (ip != free_ip) { - if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { - rcu_read_unlock(); - delay(1); - goto retry; - } - - /* - * Check the inode number again in case we're - * racing with freeing in xfs_reclaim_inode(). - * See the comments in that function for more - * information as to why the initial check is - * not sufficient. - */ - if (ip->i_ino != inum + i) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - rcu_read_unlock(); - continue; - } - } - rcu_read_unlock(); - - xfs_iflock(ip); - xfs_iflags_set(ip, XFS_ISTALE); - /* - * we don't need to attach clean inodes or those only - * with unlogged changes (which we throw away, anyway). - */ iip = ip->i_itemp; - if (!iip || xfs_inode_clean(ip)) { - ASSERT(ip != free_ip); - xfs_ifunlock(ip); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - continue; - } - iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; iip->ili_fsync_fields = 0; @@ -3930,3 +3945,22 @@ xfs_irele( trace_xfs_irele(ip, _RET_IP_); iput(VFS_I(ip)); } + +/* + * Ensure all commited transactions touching the inode are written to the log. + */ +int +xfs_log_force_inode( + struct xfs_inode *ip) +{ + xfs_lsn_t lsn = 0; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + if (xfs_ipincount(ip)) + lsn = ip->i_itemp->ili_last_lsn; + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (!lsn) + return 0; + return xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, NULL); +} diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 492e53992fa9..c6a63f6764a6 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -426,6 +426,7 @@ int xfs_itruncate_extents_flags(struct xfs_trans **, struct xfs_inode *, int, xfs_fsize_t, int); void xfs_iext_realloc(xfs_inode_t *, int, int); +int xfs_log_force_inode(struct xfs_inode *ip); void xfs_iunpin_wait(xfs_inode_t *); #define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 4a3d13d4a022..f779cca2346f 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -552,7 +552,8 @@ xfs_inode_item_push( if (!xfs_buf_delwri_queue(bp, buffer_list)) rval = XFS_ITEM_FLUSHING; xfs_buf_relse(bp); - } + } else if (error == -EAGAIN) + rval = XFS_ITEM_LOCKED; spin_lock(&lip->li_ailp->ail_lock); out_unlock: @@ -730,29 +731,27 @@ xfs_iflush_done( * holding the lock before removing the inode from the AIL. */ if (need_ail) { - bool mlip_changed = false; + xfs_lsn_t tail_lsn = 0; /* this is an opencoded batch version of xfs_trans_ail_delete */ spin_lock(&ailp->ail_lock); list_for_each_entry(blip, &tmp, li_bio_list) { if (INODE_ITEM(blip)->ili_logged && - blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) - mlip_changed |= xfs_ail_delete_one(ailp, blip); - else { + blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) { + /* + * xfs_ail_update_finish() only cares about the + * lsn of the first tail item removed, any + * others will be at the same or higher lsn so + * we just ignore them. + */ + xfs_lsn_t lsn = xfs_ail_delete_one(ailp, blip); + if (!tail_lsn && lsn) + tail_lsn = lsn; + } else { xfs_clear_li_failed(blip); } } - - if (mlip_changed) { - if (!XFS_FORCED_SHUTDOWN(ailp->ail_mount)) - xlog_assign_tail_lsn_locked(ailp->ail_mount); - if (list_empty(&ailp->ail_head)) - wake_up_all(&ailp->ail_empty); - } - spin_unlock(&ailp->ail_lock); - - if (mlip_changed) - xfs_log_space_wake(ailp->ail_mount); + xfs_ail_update_finish(ailp, tail_lsn); } /* diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 4a53768c5397..00fda2e8e738 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -24,13 +24,6 @@ kmem_zone_t *xfs_log_ticket_zone; /* Local miscellaneous function prototypes */ -STATIC int -xlog_commit_record( - struct xlog *log, - struct xlog_ticket *ticket, - struct xlog_in_core **iclog, - xfs_lsn_t *commitlsnp); - STATIC struct xlog * xlog_alloc_log( struct xfs_mount *mp, @@ -66,14 +59,6 @@ xlog_grant_push_ail( struct xlog *log, int need_bytes); STATIC void -xlog_regrant_reserve_log_space( - struct xlog *log, - struct xlog_ticket *ticket); -STATIC void -xlog_ungrant_log_space( - struct xlog *log, - struct xlog_ticket *ticket); -STATIC void xlog_sync( struct xlog *log, struct xlog_in_core *iclog); @@ -478,73 +463,6 @@ out_error: return error; } - -/* - * NOTES: - * - * 1. currblock field gets updated at startup and after in-core logs - * marked as with WANT_SYNC. - */ - -/* - * This routine is called when a user of a log manager ticket is done with - * the reservation. If the ticket was ever used, then a commit record for - * the associated transaction is written out as a log operation header with - * no data. The flag XLOG_TIC_INITED is set when the first write occurs with - * a given ticket. If the ticket was one with a permanent reservation, then - * a few operations are done differently. Permanent reservation tickets by - * default don't release the reservation. They just commit the current - * transaction with the belief that the reservation is still needed. A flag - * must be passed in before permanent reservations are actually released. - * When these type of tickets are not released, they need to be set into - * the inited state again. By doing this, a start record will be written - * out when the next write occurs. - */ -xfs_lsn_t -xfs_log_done( - struct xfs_mount *mp, - struct xlog_ticket *ticket, - struct xlog_in_core **iclog, - bool regrant) -{ - struct xlog *log = mp->m_log; - xfs_lsn_t lsn = 0; - - if (XLOG_FORCED_SHUTDOWN(log) || - /* - * If nothing was ever written, don't write out commit record. - * If we get an error, just continue and give back the log ticket. - */ - (((ticket->t_flags & XLOG_TIC_INITED) == 0) && - (xlog_commit_record(log, ticket, iclog, &lsn)))) { - lsn = (xfs_lsn_t) -1; - regrant = false; - } - - - if (!regrant) { - trace_xfs_log_done_nonperm(log, ticket); - - /* - * Release ticket if not permanent reservation or a specific - * request has been made to release a permanent reservation. - */ - xlog_ungrant_log_space(log, ticket); - } else { - trace_xfs_log_done_perm(log, ticket); - - xlog_regrant_reserve_log_space(log, ticket); - /* If this ticket was a permanent reservation and we aren't - * trying to release it, reset the inited flags; so next time - * we write, a start record will be written out. - */ - ticket->t_flags |= XLOG_TIC_INITED; - } - - xfs_log_ticket_put(ticket); - return lsn; -} - static bool __xlog_state_release_iclog( struct xlog *log, @@ -869,32 +787,44 @@ xlog_wait_on_iclog( } /* - * Final log writes as part of unmount. - * - * Mark the filesystem clean as unmount happens. Note that during relocation - * this routine needs to be executed as part of source-bag while the - * deallocation must not be done until source-end. + * Write out an unmount record using the ticket provided. We have to account for + * the data space used in the unmount ticket as this write is not done from a + * transaction context that has already done the accounting for us. */ - -/* Actually write the unmount record to disk. */ -static void -xfs_log_write_unmount_record( - struct xfs_mount *mp) +static int +xlog_write_unmount_record( + struct xlog *log, + struct xlog_ticket *ticket, + xfs_lsn_t *lsn, + uint flags) { - /* the data section must be 32 bit size aligned */ - struct xfs_unmount_log_format magic = { + struct xfs_unmount_log_format ulf = { .magic = XLOG_UNMOUNT_TYPE, }; struct xfs_log_iovec reg = { - .i_addr = &magic, - .i_len = sizeof(magic), + .i_addr = &ulf, + .i_len = sizeof(ulf), .i_type = XLOG_REG_TYPE_UNMOUNT, }; struct xfs_log_vec vec = { .lv_niovecs = 1, .lv_iovecp = ®, }; - struct xlog *log = mp->m_log; + + /* account for space used by record data */ + ticket->t_curr_res -= sizeof(ulf); + return xlog_write(log, &vec, ticket, lsn, NULL, flags, false); +} + +/* + * Mark the filesystem clean by writing an unmount record to the head of the + * log. + */ +static void +xlog_unmount_write( + struct xlog *log) +{ + struct xfs_mount *mp = log->l_mp; struct xlog_in_core *iclog; struct xlog_ticket *tic = NULL; xfs_lsn_t lsn; @@ -905,23 +835,7 @@ xfs_log_write_unmount_record( if (error) goto out_err; - /* - * If we think the summary counters are bad, clear the unmount header - * flag in the unmount record so that the summary counters will be - * recalculated during log recovery at next mount. Refer to - * xlog_check_unmount_rec for more details. - */ - if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp, - XFS_ERRTAG_FORCE_SUMMARY_RECALC)) { - xfs_alert(mp, "%s: will fix summary counters at next mount", - __func__); - flags &= ~XLOG_UNMOUNT_TRANS; - } - - /* remove inited flag, and account for space used */ - tic->t_flags = 0; - tic->t_curr_res -= sizeof(magic); - error = xlog_write(log, &vec, tic, &lsn, NULL, flags); + error = xlog_write_unmount_record(log, tic, &lsn, flags); /* * At this point, we're umounting anyway, so there's no point in * transitioning log state to IOERROR. Just continue... @@ -943,8 +857,7 @@ out_err: if (tic) { trace_xfs_log_umount_write(log, tic); - xlog_ungrant_log_space(log, tic); - xfs_log_ticket_put(tic); + xfs_log_ticket_ungrant(log, tic); } } @@ -987,8 +900,22 @@ xfs_log_unmount_write( if (XLOG_FORCED_SHUTDOWN(log)) return; + + /* + * If we think the summary counters are bad, avoid writing the unmount + * record to force log recovery at next mount, after which the summary + * counters will be recalculated. Refer to xlog_check_unmount_rec for + * more details. + */ + if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp, + XFS_ERRTAG_FORCE_SUMMARY_RECALC)) { + xfs_alert(mp, "%s: will fix summary counters at next mount", + __func__); + return; + } + xfs_log_unmount_verify_iclog(log); - xfs_log_write_unmount_record(mp); + xlog_unmount_write(log); } /* @@ -1515,20 +1442,17 @@ out: return ERR_PTR(error); } /* xlog_alloc_log */ - /* * Write out the commit record of a transaction associated with the given - * ticket. Return the lsn of the commit record. + * ticket to close off a running log write. Return the lsn of the commit record. */ -STATIC int +int xlog_commit_record( struct xlog *log, struct xlog_ticket *ticket, struct xlog_in_core **iclog, - xfs_lsn_t *commitlsnp) + xfs_lsn_t *lsn) { - struct xfs_mount *mp = log->l_mp; - int error; struct xfs_log_iovec reg = { .i_addr = NULL, .i_len = 0, @@ -1538,12 +1462,15 @@ xlog_commit_record( .lv_niovecs = 1, .lv_iovecp = ®, }; + int error; + + if (XLOG_FORCED_SHUTDOWN(log)) + return -EIO; - ASSERT_ALWAYS(iclog); - error = xlog_write(log, &vec, ticket, commitlsnp, iclog, - XLOG_COMMIT_TRANS); + error = xlog_write(log, &vec, ticket, lsn, iclog, XLOG_COMMIT_TRANS, + false); if (error) - xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); + xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); return error; } @@ -1761,7 +1688,15 @@ xlog_write_iclog( iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno; iclog->ic_bio.bi_end_io = xlog_bio_end_io; iclog->ic_bio.bi_private = iclog; - iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_FUA; + + /* + * We use REQ_SYNC | REQ_IDLE here to tell the block layer the are more + * IOs coming immediately after this one. This prevents the block layer + * writeback throttle from throttling log writes behind background + * metadata writeback and causing priority inversions. + */ + iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | + REQ_IDLE | REQ_FUA; if (need_flush) iclog->ic_bio.bi_opf |= REQ_PREFLUSH; @@ -1981,7 +1916,7 @@ xlog_dealloc_log( log->l_mp->m_log = NULL; destroy_workqueue(log->l_ioend_workqueue); kmem_free(log); -} /* xlog_dealloc_log */ +} /* * Update counters atomically now that memcpy is done. @@ -2118,23 +2053,21 @@ xlog_print_trans( } /* - * Calculate the potential space needed by the log vector. Each region gets - * its own xlog_op_header_t and may need to be double word aligned. + * Calculate the potential space needed by the log vector. We may need a start + * record, and each region gets its own struct xlog_op_header and may need to be + * double word aligned. */ static int xlog_write_calc_vec_length( struct xlog_ticket *ticket, - struct xfs_log_vec *log_vector) + struct xfs_log_vec *log_vector, + bool need_start_rec) { struct xfs_log_vec *lv; - int headers = 0; + int headers = need_start_rec ? 1 : 0; int len = 0; int i; - /* acct for start rec of xact */ - if (ticket->t_flags & XLOG_TIC_INITED) - headers++; - for (lv = log_vector; lv; lv = lv->lv_next) { /* we don't write ordered log vectors */ if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) @@ -2156,27 +2089,16 @@ xlog_write_calc_vec_length( return len; } -/* - * If first write for transaction, insert start record We can't be trying to - * commit if we are inited. We can't have any "partial_copy" if we are inited. - */ -static int +static void xlog_write_start_rec( struct xlog_op_header *ophdr, struct xlog_ticket *ticket) { - if (!(ticket->t_flags & XLOG_TIC_INITED)) - return 0; - ophdr->oh_tid = cpu_to_be32(ticket->t_tid); ophdr->oh_clientid = ticket->t_clientid; ophdr->oh_len = 0; ophdr->oh_flags = XLOG_START_TRANS; ophdr->oh_res2 = 0; - - ticket->t_flags &= ~XLOG_TIC_INITED; - - return sizeof(struct xlog_op_header); } static xlog_op_header_t * @@ -2365,13 +2287,14 @@ xlog_write( struct xlog_ticket *ticket, xfs_lsn_t *start_lsn, struct xlog_in_core **commit_iclog, - uint flags) + uint flags, + bool need_start_rec) { struct xlog_in_core *iclog = NULL; - struct xfs_log_iovec *vecp; - struct xfs_log_vec *lv; + struct xfs_log_vec *lv = log_vector; + struct xfs_log_iovec *vecp = lv->lv_iovecp; + int index = 0; int len; - int index; int partial_copy = 0; int partial_copy_len = 0; int contwr = 0; @@ -2379,25 +2302,13 @@ xlog_write( int data_cnt = 0; int error = 0; - *start_lsn = 0; - - len = xlog_write_calc_vec_length(ticket, log_vector); - /* - * Region headers and bytes are already accounted for. - * We only need to take into account start records and - * split regions in this function. + * If this is a commit or unmount transaction, we don't need a start + * record to be written. We do, however, have to account for the + * commit or unmount header that gets written. Hence we always have + * to account for an extra xlog_op_header here. */ - if (ticket->t_flags & XLOG_TIC_INITED) - ticket->t_curr_res -= sizeof(xlog_op_header_t); - - /* - * Commit record headers need to be accounted for. These - * come in as separate writes so are easy to detect. - */ - if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS)) - ticket->t_curr_res -= sizeof(xlog_op_header_t); - + ticket->t_curr_res -= sizeof(struct xlog_op_header); if (ticket->t_curr_res < 0) { xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, "ctx ticket reservation ran out. Need to up reservation"); @@ -2405,9 +2316,8 @@ xlog_write( xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); } - index = 0; - lv = log_vector; - vecp = lv->lv_iovecp; + len = xlog_write_calc_vec_length(ticket, log_vector, need_start_rec); + *start_lsn = 0; while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { void *ptr; int log_offset; @@ -2431,7 +2341,6 @@ xlog_write( while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { struct xfs_log_iovec *reg; struct xlog_op_header *ophdr; - int start_rec_copy; int copy_len; int copy_off; bool ordered = false; @@ -2447,11 +2356,15 @@ xlog_write( ASSERT(reg->i_len % sizeof(int32_t) == 0); ASSERT((unsigned long)ptr % sizeof(int32_t) == 0); - start_rec_copy = xlog_write_start_rec(ptr, ticket); - if (start_rec_copy) { - record_cnt++; + /* + * Before we start formatting log vectors, we need to + * write a start record. Only do this for the first + * iclog we write to. + */ + if (need_start_rec) { + xlog_write_start_rec(ptr, ticket); xlog_write_adv_cnt(&ptr, &len, &log_offset, - start_rec_copy); + sizeof(struct xlog_op_header)); } ophdr = xlog_write_setup_ophdr(log |