From 757a69ef6cf2bf839bd4088e5609ddddd663b0c4 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Tue, 8 Aug 2017 18:19:47 -0700
Subject: xfs: write unmount record for ro mounts

There are dueling comments in the xfs code about intent
for log writes when unmounting a readonly filesystem.

In xfs_mountfs, we see the intent:

/*
 * Now the log is fully replayed, we can transition to full read-only
 * mode for read-only mounts. This will sync all the metadata and clean
 * the log so that the recovery we just performed does not have to be
 * replayed again on the next mount.
 */

and it calls xfs_quiesce_attr(), but by the time we get to
xfs_log_unmount_write(), it returns early for a RDONLY mount:

 * Don't write out unmount record on read-only mounts.

Because of this, sequential ro mounts of a filesystem with
a dirty log will replay the log each time, which seems odd.

Fix this by writing an unmount record even for RO mounts, as long
as norecovery wasn't specified (don't write a clean log record
if a dirty log may still be there!) and the log device is
writable.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_log.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 4ebd0bafc914..972eda87db2b 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -812,11 +812,14 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 	int		 error;
 
 	/*
-	 * Don't write out unmount record on read-only mounts.
+	 * Don't write out unmount record on norecovery mounts or ro devices.
 	 * Or, if we are doing a forced umount (typically because of IO errors).
 	 */
-	if (mp->m_flags & XFS_MOUNT_RDONLY)
+	if (mp->m_flags & XFS_MOUNT_NORECOVERY ||
+	    xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) {
+		ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
 		return 0;
+	}
 
 	error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
 	ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log)));
-- 
cgit v1.2.3


From 6f4a1eefdd0ad4561543270a7fceadabcca075dd Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Tue, 8 Aug 2017 18:21:49 -0700
Subject: xfs: toggle readonly state around xfs_log_mount_finish

When we do log recovery on a readonly mount, unlinked inode
processing does not happen due to the readonly checks in
xfs_inactive(), which are trying to prevent any I/O on a
readonly mount.

This is misguided - we do I/O on readonly mounts all the time,
for consistency; for example, log recovery.  So do the same
RDONLY flag twiddling around xfs_log_mount_finish() as we
do around xfs_log_mount(), for the same reason.

This all cries out for a big rework but for now this is a
simple fix to an obvious problem.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_log.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 972eda87db2b..f467f1230444 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -743,10 +743,14 @@ xfs_log_mount_finish(
 	struct xfs_mount	*mp)
 {
 	int	error = 0;
+	bool	readonly = (mp->m_flags & XFS_MOUNT_RDONLY);
 
 	if (mp->m_flags & XFS_MOUNT_NORECOVERY) {
 		ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
 		return 0;
+	} else if (readonly) {
+		/* Allow unlinked processing to proceed */
+		mp->m_flags &= ~XFS_MOUNT_RDONLY;
 	}
 
 	/*
@@ -764,6 +768,9 @@ xfs_log_mount_finish(
 		xfs_log_work_queue(mp);
 	mp->m_super->s_flags &= ~MS_ACTIVE;
 
+	if (readonly)
+		mp->m_flags |= XFS_MOUNT_RDONLY;
+
 	return error;
 }
 
-- 
cgit v1.2.3


From 0b80ae6ed13169bd3a244e71169f2cc020b0c57a Mon Sep 17 00:00:00 2001
From: Carlos Maiolino <cmaiolino@redhat.com>
Date: Tue, 8 Aug 2017 18:21:50 -0700
Subject: xfs: Add infrastructure needed for error propagation during buffer IO
 failure

With the current code, XFS never re-submit a failed buffer for IO,
because the failed item in the buffer is kept in the flush locked state
forever.

To be able to resubmit an log item for IO, we need a way to mark an item
as failed, if, for any reason the buffer which the item belonged to
failed during writeback.

Add a new log item callback to be used after an IO completion failure
and make the needed clean ups.

Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Carlos Maiolino <cmaiolino@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_buf_item.c | 32 +++++++++++++++++++++++++++++++-
 fs/xfs/xfs_trans.h    |  7 +++++--
 2 files changed, 36 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index f6a8422e9562..7573a1f0bc9a 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -29,6 +29,7 @@
 #include "xfs_error.h"
 #include "xfs_trace.h"
 #include "xfs_log.h"
+#include "xfs_inode.h"
 
 
 kmem_zone_t	*xfs_buf_item_zone;
@@ -1054,6 +1055,31 @@ xfs_buf_do_callbacks(
 	}
 }
 
+/*
+ * Invoke the error state callback for each log item affected by the failed I/O.
+ *
+ * If a metadata buffer write fails with a non-permanent error, the buffer is
+ * eventually resubmitted and so the completion callbacks are not run. The error
+ * state may need to be propagated to the log items attached to the buffer,
+ * however, so the next AIL push of the item knows hot to handle it correctly.
+ */
+STATIC void
+xfs_buf_do_callbacks_fail(
+	struct xfs_buf		*bp)
+{
+	struct xfs_log_item	*next;
+	struct xfs_log_item	*lip = bp->b_fspriv;
+	struct xfs_ail		*ailp = lip->li_ailp;
+
+	spin_lock(&ailp->xa_lock);
+	for (; lip; lip = next) {
+		next = lip->li_bio_list;
+		if (lip->li_ops->iop_error)
+			lip->li_ops->iop_error(lip, bp);
+	}
+	spin_unlock(&ailp->xa_lock);
+}
+
 static bool
 xfs_buf_iodone_callback_error(
 	struct xfs_buf		*bp)
@@ -1123,7 +1149,11 @@ xfs_buf_iodone_callback_error(
 	if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount)
 		goto permanent_error;
 
-	/* still a transient error, higher layers will retry */
+	/*
+	 * Still a transient error, run IO completion failure callbacks and let
+	 * the higher layers retry the buffer.
+	 */
+	xfs_buf_do_callbacks_fail(bp);
 	xfs_buf_ioerror(bp, 0);
 	xfs_buf_relse(bp);
 	return true;
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 6bdad6f58934..442d679210a1 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -64,11 +64,13 @@ typedef struct xfs_log_item {
 } xfs_log_item_t;
 
 #define	XFS_LI_IN_AIL	0x1
-#define XFS_LI_ABORTED	0x2
+#define	XFS_LI_ABORTED	0x2
+#define	XFS_LI_FAILED	0x4
 
 #define XFS_LI_FLAGS \
 	{ XFS_LI_IN_AIL,	"IN_AIL" }, \
-	{ XFS_LI_ABORTED,	"ABORTED" }
+	{ XFS_LI_ABORTED,	"ABORTED" }, \
+	{ XFS_LI_FAILED,	"FAILED" }
 
 struct xfs_item_ops {
 	void (*iop_size)(xfs_log_item_t *, int *, int *);
@@ -79,6 +81,7 @@ struct xfs_item_ops {
 	void (*iop_unlock)(xfs_log_item_t *);
 	xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
 	void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
+	void (*iop_error)(xfs_log_item_t *, xfs_buf_t *);
 };
 
 void	xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item,
-- 
cgit v1.2.3


From d3a304b6292168b83b45d624784f973fdc1ca674 Mon Sep 17 00:00:00 2001
From: Carlos Maiolino <cmaiolino@redhat.com>
Date: Tue, 8 Aug 2017 18:21:50 -0700
Subject: xfs: Properly retry failed inode items in case of error during buffer
 writeback

When a buffer has been failed during writeback, the inode items into it
are kept flush locked, and are never resubmitted due the flush lock, so,
if any buffer fails to be written, the items in AIL are never written to
disk and never unlocked.

This causes unmount operation to hang due these items flush locked in AIL,
but this also causes the items in AIL to never be written back, even when
the IO device comes back to normal.

I've been testing this patch with a DM-thin device, creating a
filesystem larger than the real device.

When writing enough data to fill the DM-thin device, XFS receives ENOSPC
errors from the device, and keep spinning on xfsaild (when 'retry
forever' configuration is set).

At this point, the filesystem can not be unmounted because of the flush locked
items in AIL, but worse, the items in AIL are never retried at all
(once xfs_inode_item_push() will skip the items that are flush locked),
even if the underlying DM-thin device is expanded to the proper size.

This patch fixes both cases, retrying any item that has been failed
previously, using the infra-structure provided by the previous patch.

Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Carlos Maiolino <cmaiolino@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_buf_item.c   | 28 ++++++++++++++++++++++++++++
 fs/xfs/xfs_buf_item.h   |  3 +++
 fs/xfs/xfs_inode_item.c | 47 +++++++++++++++++++++++++++++++++++++++++++----
 fs/xfs/xfs_trans.h      |  1 +
 fs/xfs/xfs_trans_ail.c  |  3 ++-
 fs/xfs/xfs_trans_priv.h | 31 +++++++++++++++++++++++++++++++
 6 files changed, 108 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 7573a1f0bc9a..573fc72c3f23 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -1234,3 +1234,31 @@ xfs_buf_iodone(
 	xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE);
 	xfs_buf_item_free(BUF_ITEM(lip));
 }
+
+/*
+ * Requeue a failed buffer for writeback
+ *
+ * Return true if the buffer has been re-queued properly, false otherwise
+ */
+bool
+xfs_buf_resubmit_failed_buffers(
+	struct xfs_buf		*bp,
+	struct xfs_log_item	*lip,
+	struct list_head	*buffer_list)
+{
+	struct xfs_log_item	*next;
+
+	/*
+	 * Clear XFS_LI_FAILED flag from all items before resubmit
+	 *
+	 * XFS_LI_FAILED set/clear is protected by xa_lock, caller  this
+	 * function already have it acquired
+	 */
+	for (; lip; lip = next) {
+		next = lip->li_bio_list;
+		xfs_clear_li_failed(lip);
+	}
+
+	/* Add this buffer back to the delayed write list */
+	return xfs_buf_delwri_queue(bp, buffer_list);
+}
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index f7eba99d19dd..530686e1afb9 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -70,6 +70,9 @@ void	xfs_buf_attach_iodone(struct xfs_buf *,
 			      xfs_log_item_t *);
 void	xfs_buf_iodone_callbacks(struct xfs_buf *);
 void	xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *);
+bool	xfs_buf_resubmit_failed_buffers(struct xfs_buf *,
+					struct xfs_log_item *,
+					struct list_head *);
 
 extern kmem_zone_t	*xfs_buf_item_zone;
 
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 013cc78d7daf..6d0f74ec31e8 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -27,6 +27,7 @@
 #include "xfs_error.h"
 #include "xfs_trace.h"
 #include "xfs_trans_priv.h"
+#include "xfs_buf_item.h"
 #include "xfs_log.h"
 
 
@@ -475,6 +476,23 @@ xfs_inode_item_unpin(
 		wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT);
 }
 
+/*
+ * Callback used to mark a buffer with XFS_LI_FAILED when items in the buffer
+ * have been failed during writeback
+ *
+ * This informs the AIL that the inode is already flush locked on the next push,
+ * and acquires a hold on the buffer to ensure that it isn't reclaimed before
+ * dirty data makes it to disk.
+ */
+STATIC void
+xfs_inode_item_error(
+	struct xfs_log_item	*lip,
+	struct xfs_buf		*bp)
+{
+	ASSERT(xfs_isiflocked(INODE_ITEM(lip)->ili_inode));
+	xfs_set_li_failed(lip, bp);
+}
+
 STATIC uint
 xfs_inode_item_push(
 	struct xfs_log_item	*lip,
@@ -484,13 +502,28 @@ xfs_inode_item_push(
 {
 	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
 	struct xfs_inode	*ip = iip->ili_inode;
-	struct xfs_buf		*bp = NULL;
+	struct xfs_buf		*bp = lip->li_buf;
 	uint			rval = XFS_ITEM_SUCCESS;
 	int			error;
 
 	if (xfs_ipincount(ip) > 0)
 		return XFS_ITEM_PINNED;
 
+	/*
+	 * The buffer containing this item failed to be written back
+	 * previously. Resubmit the buffer for IO.
+	 */
+	if (lip->li_flags & XFS_LI_FAILED) {
+		if (!xfs_buf_trylock(bp))
+			return XFS_ITEM_LOCKED;
+
+		if (!xfs_buf_resubmit_failed_buffers(bp, lip, buffer_list))
+			rval = XFS_ITEM_FLUSHING;
+
+		xfs_buf_unlock(bp);
+		return rval;
+	}
+
 	if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
 		return XFS_ITEM_LOCKED;
 
@@ -622,7 +655,8 @@ static const struct xfs_item_ops xfs_inode_item_ops = {
 	.iop_unlock	= xfs_inode_item_unlock,
 	.iop_committed	= xfs_inode_item_committed,
 	.iop_push	= xfs_inode_item_push,
-	.iop_committing = xfs_inode_item_committing
+	.iop_committing = xfs_inode_item_committing,
+	.iop_error	= xfs_inode_item_error
 };
 
 
@@ -710,7 +744,8 @@ xfs_iflush_done(
 		 * the AIL lock.
 		 */
 		iip = INODE_ITEM(blip);
-		if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn)
+		if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) ||
+		    lip->li_flags & XFS_LI_FAILED)
 			need_ail++;
 
 		blip = next;
@@ -718,7 +753,8 @@ xfs_iflush_done(
 
 	/* make sure we capture the state of the initial inode. */
 	iip = INODE_ITEM(lip);
-	if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn)
+	if ((iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) ||
+	    lip->li_flags & XFS_LI_FAILED)
 		need_ail++;
 
 	/*
@@ -739,6 +775,9 @@ xfs_iflush_done(
 			if (INODE_ITEM(blip)->ili_logged &&
 			    blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn)
 				mlip_changed |= xfs_ail_delete_one(ailp, blip);
+			else {
+				xfs_clear_li_failed(blip);
+			}
 		}
 
 		if (mlip_changed) {
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 442d679210a1..7d627721e4b3 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -49,6 +49,7 @@ typedef struct xfs_log_item {
 	struct xfs_ail			*li_ailp;	/* ptr to AIL */
 	uint				li_type;	/* item type */
 	uint				li_flags;	/* misc flags */
+	struct xfs_buf			*li_buf;	/* real buffer pointer */
 	struct xfs_log_item		*li_bio_list;	/* buffer item list */
 	void				(*li_cb)(struct xfs_buf *,
 						 struct xfs_log_item *);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 9056c0f34a3c..70f5ab017323 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -687,12 +687,13 @@ xfs_trans_ail_update_bulk(
 bool
 xfs_ail_delete_one(
 	struct xfs_ail		*ailp,
-	struct xfs_log_item 	*lip)
+	struct xfs_log_item	*lip)
 {
 	struct xfs_log_item	*mlip = xfs_ail_min(ailp);
 
 	trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
 	xfs_ail_delete(ailp, lip);
+	xfs_clear_li_failed(lip);
 	lip->li_flags &= ~XFS_LI_IN_AIL;
 	lip->li_lsn = 0;
 
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index d91706c56c63..b317a3644c00 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -164,4 +164,35 @@ xfs_trans_ail_copy_lsn(
 	*dst = *src;
 }
 #endif
+
+static inline void
+xfs_clear_li_failed(
+	struct xfs_log_item	*lip)
+{
+	struct xfs_buf	*bp = lip->li_buf;
+
+	ASSERT(lip->li_flags & XFS_LI_IN_AIL);
+	lockdep_assert_held(&lip->li_ailp->xa_lock);
+
+	if (lip->li_flags & XFS_LI_FAILED) {
+		lip->li_flags &= ~XFS_LI_FAILED;
+		lip->li_buf = NULL;
+		xfs_buf_rele(bp);
+	}
+}
+
+static inline void
+xfs_set_li_failed(
+	struct xfs_log_item	*lip,
+	struct xfs_buf		*bp)
+{
+	lockdep_assert_held(&lip->li_ailp->xa_lock);
+
+	if (!(lip->li_flags & XFS_LI_FAILED)) {
+		xfs_buf_hold(bp);
+		lip->li_flags |= XFS_LI_FAILED;
+		lip->li_buf = bp;
+	}
+}
+
 #endif	/* __XFS_TRANS_PRIV_H__ */
-- 
cgit v1.2.3


From 284f1c2c9bebf871861184b0e2c40fa921dd380b Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Tue, 8 Aug 2017 18:21:51 -0700
Subject: xfs: fix recovery failure when log record header wraps log end

The high-level log recovery algorithm consists of two loops that
walk the physical log and process log records from the tail to the
head. The first loop handles the case where the tail is beyond the
head and processes records up to the end of the physical log. The
subsequent loop processes records from the beginning of the physical
log to the head.

Because log records can wrap around the end of the physical log, the
first loop mentioned above must handle this case appropriately.
Records are processed from in-core buffers, which means that this
algorithm must split the reads of such records into two partial
I/Os: 1.) from the beginning of the record to the end of the log and
2.) from the beginning of the log to the end of the record. This is
further complicated by the fact that the log record header and log
record data are read into independent buffers.

The current handling of each buffer correctly splits the reads when
either the header or data starts before the end of the log and wraps
around the end. The data read does not correctly handle the case
where the prior header read wrapped or ends on the physical log end
boundary. blk_no is incremented to or beyond the log end after the
header read to point to the record data, but the split data read
logic triggers, attempts to read from an invalid log block and
ultimately causes log recovery to fail. This can be reproduced
fairly reliably via xfstests tests generic/047 and generic/388 with
large iclog sizes (256k) and small (10M) logs.

If the record header read has pushed beyond the end of the physical
log, the subsequent data read is actually contiguous. Update the
data read logic to detect the case where blk_no has wrapped, mod it
against the log size to read from the correct address and issue one
contiguous read for the log data buffer. The log record is processed
as normal from the buffer(s), the loop exits after the current
iteration and the subsequent loop picks up with the first new record
after the start of the log.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_log_recover.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 9549188f5a36..36a179f2c931 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -5218,7 +5218,7 @@ xlog_do_recovery_pass(
 	xfs_daddr_t		*first_bad)	/* out: first bad log rec */
 {
 	xlog_rec_header_t	*rhead;
-	xfs_daddr_t		blk_no;
+	xfs_daddr_t		blk_no, rblk_no;
 	xfs_daddr_t		rhead_blk;
 	char			*offset;
 	xfs_buf_t		*hbp, *dbp;
@@ -5371,9 +5371,19 @@ xlog_do_recovery_pass(
 			bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
 			blk_no += hblks;
 
-			/* Read in data for log record */
-			if (blk_no + bblks <= log->l_logBBsize) {
-				error = xlog_bread(log, blk_no, bblks, dbp,
+			/*
+			 * Read the log record data in multiple reads if it
+			 * wraps around the end of the log. Note that if the
+			 * header already wrapped, blk_no could point past the
+			 * end of the log. The record data is contiguous in
+			 * that case.
+			 */
+			if (blk_no + bblks <= log->l_logBBsize ||
+			    blk_no >= log->l_logBBsize) {
+				/* mod blk_no in case the header wrapped and
+				 * pushed it beyond the end of the log */
+				rblk_no = do_mod(blk_no, log->l_logBBsize);
+				error = xlog_bread(log, rblk_no, bblks, dbp,
 						   &offset);
 				if (error)
 					goto bread_err2;
-- 
cgit v1.2.3


From 5297ac1f6d7cbf45464a49b9558831f271dfc559 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Tue, 8 Aug 2017 18:21:51 -0700
Subject: xfs: always verify the log tail during recovery

Log tail verification currently only occurs when torn writes are
detected at the head of the log. This was introduced because a
change in the head block due to torn writes can lead to a change in
the tail block (each log record header references the current tail)
and the tail block should be verified before log recovery proceeds.

Tail corruption is possible outside of torn write scenarios,
however. For example, partial log writes can be detected and cleared
during the initial head/tail block discovery process. If the partial
write coincides with a tail overwrite, the log tail is corrupted and
recovery fails.

To facilitate correct handling of log tail overwites, update log
recovery to always perform tail verification. This is necessary to
detect potential tail overwrite conditions when torn writes may not
have occurred. This changes normal (i.e., no torn writes) recovery
behavior slightly to detect and return CRC related errors near the
tail before actual recovery starts.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_log_recover.c | 26 +++-----------------------
 1 file changed, 3 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 36a179f2c931..c1337127d8fb 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1183,31 +1183,11 @@ xlog_verify_head(
 			ASSERT(0);
 			return 0;
 		}
-
-		/*
-		 * Now verify the tail based on the updated head. This is
-		 * required because the torn writes trimmed from the head could
-		 * have been written over the tail of a previous record. Return
-		 * any errors since recovery cannot proceed if the tail is
-		 * corrupt.
-		 *
-		 * XXX: This leaves a gap in truly robust protection from torn
-		 * writes in the log. If the head is behind the tail, the tail
-		 * pushes forward to create some space and then a crash occurs
-		 * causing the writes into the previous record's tail region to
-		 * tear, log recovery isn't able to recover.
-		 *
-		 * How likely is this to occur? If possible, can we do something
-		 * more intelligent here? Is it safe to push the tail forward if
-		 * we can determine that the tail is within the range of the
-		 * torn write (e.g., the kernel can only overwrite the tail if
-		 * it has actually been pushed forward)? Alternatively, could we
-		 * somehow prevent this condition at runtime?
-		 */
-		error = xlog_verify_tail(log, *head_blk, *tail_blk);
 	}
+	if (error)
+		return error;
 
-	return error;
+	return xlog_verify_tail(log, *head_blk, *tail_blk);
 }
 
 /*
-- 
cgit v1.2.3


From 4a4f66eac4681378996a1837ad1ffec3a2e2981f Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Tue, 8 Aug 2017 18:21:52 -0700
Subject: xfs: fix log recovery corruption error due to tail overwrite

If we consider the case where the tail (T) of the log is pinned long
enough for the head (H) to push and block behind the tail, we can
end up blocked in the following state without enough free space (f)
in the log to satisfy a transaction reservation:

	0	phys. log	N
	[-------HffT---H'--T'---]

The last good record in the log (before H) refers to T. The tail
eventually pushes forward (T') leaving more free space in the log
for writes to H. At this point, suppose space frees up in the log
for the maximum of 8 in-core log buffers to start flushing out to
the log. If this pushes the head from H to H', these next writes
overwrite the previous tail T. This is safe because the items logged
from T to T' have been written back and removed from the AIL.

If the next log writes (H -> H') happen to fail and result in
partial records in the log, the filesystem shuts down having
overwritten T with invalid data. Log recovery correctly locates H on
the subsequent mount, but H still refers to the now corrupted tail
T. This results in log corruption errors and recovery failure.

Since the tail overwrite results from otherwise correct runtime
behavior, it is up to log recovery to try and deal with this
situation. Update log recovery tail verification to run a CRC pass
from the first record past the tail to the head. This facilitates
error detection at T and moves the recovery tail to the first good
record past H' (similar to truncating the head on torn write
detection). If corruption is detected beyond the range possibly
affected by the max number of iclogs, the log is legitimately
corrupted and log recovery failure is expected.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_log_recover.c | 108 +++++++++++++++++++++++++++++++++--------------
 1 file changed, 77 insertions(+), 31 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index c1337127d8fb..a5e2ca8f5cd6 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1029,61 +1029,106 @@ out_error:
 }
 
 /*
- * Check the log tail for torn writes. This is required when torn writes are
- * detected at the head and the head had to be walked back to a previous record.
- * The tail of the previous record must now be verified to ensure the torn
- * writes didn't corrupt the previous tail.
+ * Calculate distance from head to tail (i.e., unused space in the log).
+ */
+static inline int
+xlog_tail_distance(
+	struct xlog	*log,
+	xfs_daddr_t	head_blk,
+	xfs_daddr_t	tail_blk)
+{
+	if (head_blk < tail_blk)
+		return tail_blk - head_blk;
+
+	return tail_blk + (log->l_logBBsize - head_blk);
+}
+
+/*
+ * Verify the log tail. This is particularly important when torn or incomplete
+ * writes have been detected near the front of the log and the head has been
+ * walked back accordingly.
+ *
+ * We also have to handle the case where the tail was pinned and the head
+ * blocked behind the tail right before a crash. If the tail had been pushed
+ * immediately prior to the crash and the subsequent checkpoint was only
+ * partially written, it's possible it overwrote the last referenced tail in the
+ * log with garbage. This is not a coherency problem because the tail must have
+ * been pushed before it can be overwritten, but appears as log corruption to
+ * recovery because we have no way to know the tail was updated if the
+ * subsequent checkpoint didn't write successfully.
  *
- * Return an error if CRC verification fails as recovery cannot proceed.
+ * Therefore, CRC check the log from tail to head. If a failure occurs and the
+ * offending record is within max iclog bufs from the head, walk the tail
+ * forward and retry until a valid tail is found or corruption is detected out
+ * of the range of a possible overwrite.
  */
 STATIC int
 xlog_verify_tail(
 	struct xlog		*log,
 	xfs_daddr_t		head_blk,
-	xfs_daddr_t		tail_blk)
+	xfs_daddr_t		*tail_blk,
+	int			hsize)
 {
 	struct xlog_rec_header	*thead;
 	struct xfs_buf		*bp;
 	xfs_daddr_t		first_bad;
-	int			count;
 	int			error = 0;
 	bool			wrapped;
-	xfs_daddr_t		tmp_head;
+	xfs_daddr_t		tmp_tail;
+	xfs_daddr_t		orig_tail = *tail_blk;
 
 	bp = xlog_get_bp(log, 1);
 	if (!bp)
 		return -ENOMEM;
 
 	/*
-	 * Seek XLOG_MAX_ICLOGS + 1 records past the current tail record to get
-	 * a temporary head block that points after the last possible
-	 * concurrently written record of the tail.
+	 * Make sure the tail points to a record (returns positive count on
+	 * success).
 	 */
-	count = xlog_seek_logrec_hdr(log, head_blk, tail_blk,
-				     XLOG_MAX_ICLOGS + 1, bp, &tmp_head, &thead,
-				     &wrapped);
-	if (count < 0) {
-		error = count;
+	error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, bp,
+			&tmp_tail, &thead, &wrapped);
+	if (error < 0)
 		goto out;
-	}
+	if (*tail_blk != tmp_tail)
+		*tail_blk = tmp_tail;
 
 	/*
-	 * If the call above didn't find XLOG_MAX_ICLOGS + 1 records, we ran
-	 * into the actual log head. tmp_head points to the start of the record
-	 * so update it to the actual head block.
+	 * Run a CRC check from the tail to the head. We can't just check
+	 * MAX_ICLOGS records past the tail because the tail may point to stale
+	 * blocks cleared during the search for the head/tail. These blocks are
+	 * overwritten with zero-length records and thus record count is not a
+	 * reliable indicator of the iclog state before a crash.
 	 */
-	if (count < XLOG_MAX_ICLOGS + 1)
-		tmp_head = head_blk;
-
-	/*
-	 * We now have a tail and temporary head block that covers at least
-	 * XLOG_MAX_ICLOGS records from the tail. We need to verify that these
-	 * records were completely written. Run a CRC verification pass from
-	 * tail to head and return the result.
-	 */
-	error = xlog_do_recovery_pass(log, tmp_head, tail_blk,
+	first_bad = 0;
+	error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
 				      XLOG_RECOVER_CRCPASS, &first_bad);
+	while (error == -EFSBADCRC && first_bad) {
+		int	tail_distance;
+
+		/*
+		 * Is corruption within range of the head? If so, retry from
+		 * the next record. Otherwise return an error.
+		 */
+		tail_distance = xlog_tail_distance(log, head_blk, first_bad);
+		if (tail_distance > BTOBB(XLOG_MAX_ICLOGS * hsize))
+			break;
 
+		/* skip to the next record; returns positive count on success */
+		error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2, bp,
+				&tmp_tail, &thead, &wrapped);
+		if (error < 0)
+			goto out;
+
+		*tail_blk = tmp_tail;
+		first_bad = 0;
+		error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
+					      XLOG_RECOVER_CRCPASS, &first_bad);
+	}
+
+	if (!error && *tail_blk != orig_tail)
+		xfs_warn(log->l_mp,
+		"Tail block (0x%llx) overwrite detected. Updated to 0x%llx",
+			 orig_tail, *tail_blk);
 out:
 	xlog_put_bp(bp);
 	return error;
@@ -1187,7 +1232,8 @@ xlog_verify_head(
 	if (error)
 		return error;
 
-	return xlog_verify_tail(log, *head_blk, *tail_blk);
+	return xlog_verify_tail(log, *head_blk, tail_blk,
+				be32_to_cpu((*rhead)->h_size));
 }
 
 /*
-- 
cgit v1.2.3


From 7f4d01f36a3ac16f539f0fd3839de5d58fa4940f Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Tue, 8 Aug 2017 18:21:52 -0700
Subject: xfs: add log item pinning error injection tag

Add an error injection tag to force log items in the AIL to the
pinned state. This option can be used by test infrastructure to
induce head behind tail conditions. Specifically, this is intended
to be used by xfstests to reproduce log recovery problems after
failed/corrupted log writes overwrite the last good tail LSN in the
log.

When enabled, AIL push attempts see log items in the AIL in the
pinned state. This stalls metadata writeback and thus prevents the
current tail of the log from moving forward. When disabled,
subsequent AIL pushes observe the log items in their appropriate
state and filesystem operation continues as normal.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_error.c     |  3 +++
 fs/xfs/xfs_error.h     |  4 +++-
 fs/xfs/xfs_trans_ail.c | 17 ++++++++++++++++-
 3 files changed, 22 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 2f4feb959bfb..bd786a9ac2c3 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -57,6 +57,7 @@ static unsigned int xfs_errortag_random_default[] = {
 	XFS_RANDOM_AG_RESV_CRITICAL,
 	XFS_RANDOM_DROP_WRITES,
 	XFS_RANDOM_LOG_BAD_CRC,
+	XFS_RANDOM_LOG_ITEM_PIN,
 };
 
 struct xfs_errortag_attr {
@@ -161,6 +162,7 @@ XFS_ERRORTAG_ATTR_RW(bmap_finish_one,	XFS_ERRTAG_BMAP_FINISH_ONE);
 XFS_ERRORTAG_ATTR_RW(ag_resv_critical,	XFS_ERRTAG_AG_RESV_CRITICAL);
 XFS_ERRORTAG_ATTR_RW(drop_writes,	XFS_ERRTAG_DROP_WRITES);
 XFS_ERRORTAG_ATTR_RW(log_bad_crc,	XFS_ERRTAG_LOG_BAD_CRC);
+XFS_ERRORTAG_ATTR_RW(log_item_pin,	XFS_ERRTAG_LOG_ITEM_PIN);
 
 static struct attribute *xfs_errortag_attrs[] = {
 	XFS_ERRORTAG_ATTR_LIST(noerror),
@@ -193,6 +195,7 @@ static struct attribute *xfs_errortag_attrs[] = {
 	XFS_ERRORTAG_ATTR_LIST(ag_resv_critical),
 	XFS_ERRORTAG_ATTR_LIST(drop_writes),
 	XFS_ERRORTAG_ATTR_LIST(log_bad_crc),
+	XFS_ERRORTAG_ATTR_LIST(log_item_pin),
 	NULL,
 };
 
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 7577be5f09bc..7c4bef3bddb7 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -106,7 +106,8 @@ extern void xfs_verifier_error(struct xfs_buf *bp);
  */
 #define XFS_ERRTAG_DROP_WRITES				28
 #define XFS_ERRTAG_LOG_BAD_CRC				29
-#define XFS_ERRTAG_MAX					30
+#define XFS_ERRTAG_LOG_ITEM_PIN				30
+#define XFS_ERRTAG_MAX					31
 
 /*
  * Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -141,6 +142,7 @@ extern void xfs_verifier_error(struct xfs_buf *bp);
 #define XFS_RANDOM_AG_RESV_CRITICAL			4
 #define XFS_RANDOM_DROP_WRITES				1
 #define XFS_RANDOM_LOG_BAD_CRC				1
+#define XFS_RANDOM_LOG_ITEM_PIN				1
 
 #ifdef DEBUG
 extern int xfs_errortag_init(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 70f5ab017323..354368a906e5 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -325,6 +325,21 @@ xfs_ail_delete(
 	xfs_trans_ail_cursor_clear(ailp, lip);
 }
 
+static inline uint
+xfsaild_push_item(
+	struct xfs_ail		*ailp,
+	struct xfs_log_item	*lip)
+{
+	/*
+	 * If log item pinning is enabled, skip the push and track the item as
+	 * pinned. This can help induce head-behind-tail conditions.
+	 */
+	if (XFS_TEST_ERROR(false, ailp->xa_mount, XFS_ERRTAG_LOG_ITEM_PIN))
+		return XFS_ITEM_PINNED;
+
+	return lip->li_ops->iop_push(lip, &ailp->xa_buf_list);
+}
+
 static long
 xfsaild_push(
 	struct xfs_ail		*ailp)
@@ -382,7 +397,7 @@ xfsaild_push(
 		 * rely on the AIL cursor implementation to be able to deal with
 		 * the dropped lock.
 		 */
-		lock_result = lip->li_ops->iop_push(lip, &ailp->xa_buf_list);
+		lock_result = xfsaild_push_item(ailp, lip);
 		switch (lock_result) {
 		case XFS_ITEM_SUCCESS:
 			XFS_STATS_INC(mp, xs_push_ail_success);
-- 
cgit v1.2.3


From a4c9b34d6a17081005ec459b57b8effc08f4c731 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Tue, 8 Aug 2017 18:21:53 -0700
Subject: xfs: handle -EFSCORRUPTED during head/tail verification

Torn write and tail overwrite detection both trigger only on
-EFSBADCRC errors. While this is the most likely failure scenario
for each condition, -EFSCORRUPTED is still possible in certain cases
depending on what ends up on disk when a torn write or partial tail
overwrite occurs. For example, an invalid log record h_len can lead
to an -EFSCORRUPTED error when running the log recovery CRC pass.

Therefore, update log head and tail verification to trigger the
associated head/tail fixups in the event of -EFSCORRUPTED errors
along with -EFSBADCRC. Also, -EFSCORRUPTED can currently be returned
from xlog_do_recovery_pass() before rhead_blk is initialized if the
first record encountered happens to be corrupted. This leads to an
incorrect 'first_bad' return value. Initialize rhead_blk earlier in
the function to address that problem as well.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_log_recover.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index a5e2ca8f5cd6..43d5df3a563f 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1102,7 +1102,7 @@ xlog_verify_tail(
 	first_bad = 0;
 	error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
 				      XLOG_RECOVER_CRCPASS, &first_bad);
-	while (error == -EFSBADCRC && first_bad) {
+	while ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) {
 		int	tail_distance;
 
 		/*
@@ -1188,7 +1188,7 @@ xlog_verify_head(
 	 */
 	error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk,
 				      XLOG_RECOVER_CRCPASS, &first_bad);
-	if (error == -EFSBADCRC) {
+	if ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) {
 		/*
 		 * We've hit a potential torn write. Reset the error and warn
 		 * about it.
@@ -5257,7 +5257,7 @@ xlog_do_recovery_pass(
 	LIST_HEAD		(buffer_list);
 
 	ASSERT(head_blk != tail_blk);
-	rhead_blk = 0;
+	blk_no = rhead_blk = tail_blk;
 
 	for (i = 0; i < XLOG_RHASH_SIZE; i++)
 		INIT_HLIST_HEAD(&rhash[i]);
@@ -5335,7 +5335,6 @@ xlog_do_recovery_pass(
 	}
 
 	memset(rhash, 0, sizeof(rhash));
-	blk_no = rhead_blk = tail_blk;
 	if (tail_blk > head_blk) {
 		/*
 		 * Perform recovery around the end of the physical log.
-- 
cgit v1.2.3


From e67d3d4246e5fbb0c7c700426d11241ca9c6f473 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Tue, 8 Aug 2017 18:21:53 -0700
Subject: xfs: add log recovery tracepoint for head/tail

Torn write detection and tail overwrite detection can shift the log
head and tail respectively in the event of CRC mismatch or
corruption errors. Add a high-level log recovery tracepoint to dump
the final log head/tail and make those values easily attainable in
debug/diagnostic situations.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_log_recover.c |  2 ++
 fs/xfs/xfs_trace.h       | 18 ++++++++++++++++++
 2 files changed, 20 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 43d5df3a563f..a36239980cf7 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -5598,6 +5598,8 @@ xlog_do_recover(
 	xfs_buf_t	*bp;
 	xfs_sb_t	*sbp;
 
+	trace_xfs_log_recover(log, head_blk, tail_blk);
+
 	/*
 	 * First replay the images in the log.
 	 */
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index bcc3cdf8e1c5..68810477ef2c 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1963,6 +1963,24 @@ DEFINE_EVENT(xfs_swap_extent_class, name, \
 DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before);
 DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after);
 
+TRACE_EVENT(xfs_log_recover,
+	TP_PROTO(struct xlog *log, xfs_daddr_t headblk, xfs_daddr_t tailblk),
+	TP_ARGS(log, headblk, tailblk),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_daddr_t, headblk)
+		__field(xfs_daddr_t, tailblk)
+	),
+	TP_fast_assign(
+		__entry->dev = log->l_mp->m_super->s_dev;
+		__entry->headblk = headblk;
+		__entry->tailblk = tailblk;
+	),
+	TP_printk("dev %d:%d headblk 0x%llx tailblk 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->headblk,
+		  __entry->tailblk)
+)
+
 TRACE_EVENT(xfs_log_recover_record,
 	TP_PROTO(struct xlog *log, struct xlog_rec_header *rhead, int pass),
 	TP_ARGS(log, rhead, pass),
-- 
cgit v1.2.3


From 2d32311cf19bfb8c1d2b4601974ddd951f9cfd0b Mon Sep 17 00:00:00 2001
From: Carlos Maiolino <cmaiolino@redhat.com>
Date: Fri, 18 Aug 2017 18:07:04 -0700
Subject: xfs: stop searching for free slots in an inode chunk when there are
 none

In a filesystem without finobt, the Space manager selects an AG to alloc a new
inode, where xfs_dialloc_ag_inobt() will search the AG for the free slot chunk.

When the new inode is in the same AG as its parent, the btree will be searched
starting on the parent's record, and then retried from the top if no slot is
available beyond the parent's record.

To exit this loop though, xfs_dialloc_ag_inobt() relies on the fact that the
btree must have a free slot available, once its callers relied on the
agi->freecount when deciding how/where to allocate this new inode.

In the case when the agi->freecount is corrupted, showing available inodes in an
AG, when in fact there is none, this becomes an infinite loop.

Add a way to stop the loop when a free slot is not found in the btree, making
the function to fall into the whole AG scan which will then, be able to detect
the corruption and shut the filesystem down.

As pointed by Brian, this might impact performance, giving the fact we
don't reset the search distance anymore when we reach the end of the
tree, giving it fewer tries before falling back to the whole AG search, but
it will only affect searches that start within 10 records to the end of the tree.

Signed-off-by: Carlos Maiolino <cmaiolino@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_ialloc.c | 55 +++++++++++++++++++++++-----------------------
 1 file changed, 27 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index abf5beaae907..1e0658a3f155 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -1133,6 +1133,7 @@ xfs_dialloc_ag_inobt(
 	int			error;
 	int			offset;
 	int			i, j;
+	int			searchdistance = 10;
 
 	pag = xfs_perag_get(mp, agno);
 
@@ -1159,7 +1160,6 @@ xfs_dialloc_ag_inobt(
 	if (pagno == agno) {
 		int		doneleft;	/* done, to the left */
 		int		doneright;	/* done, to the right */
-		int		searchdistance = 10;
 
 		error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
 		if (error)
@@ -1220,21 +1220,9 @@ xfs_dialloc_ag_inobt(
 		/*
 		 * Loop until we find an inode chunk with a free inode.
 		 */
-		while (!doneleft || !doneright) {
+		while (--searchdistance > 0 && (!doneleft || !doneright)) {
 			int	useleft;  /* using left inode chunk this time */
 
-			if (!--searchdistance) {
-				/*
-				 * Not in range - save last search
-				 * location and allocate a new inode
-				 */
-				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
-				pag->pagl_leftrec = trec.ir_startino;
-				pag->pagl_rightrec = rec.ir_startino;
-				pag->pagl_pagino = pagino;
-				goto newino;
-			}
-
 			/* figure out the closer block if both are valid. */
 			if (!doneleft && !doneright) {
 				useleft = pagino -
@@ -1278,26 +1266,37 @@ xfs_dialloc_ag_inobt(
 				goto error1;
 		}
 
-		/*
-		 * We've reached the end of the btree. because
-		 * we are only searching a small chunk of the
-		 * btree each search, there is obviously free
-		 * inodes closer to the parent inode than we
-		 * are now. restart the search again.
-		 */
-		pag->pagl_pagino = NULLAGINO;
-		pag->pagl_leftrec = NULLAGINO;
-		pag->pagl_rightrec = NULLAGINO;
-		xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
-		xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
-		goto restart_pagno;
+		if (searchdistance <= 0) {
+			/*
+			 * Not in range - save last search
+			 * location and allocate a new inode
+			 */
+			xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+			pag->pagl_leftrec = trec.ir_startino;
+			pag->pagl_rightrec = rec.ir_startino;
+			pag->pagl_pagino = pagino;
+
+		} else {
+			/*
+			 * We've reached the end of the btree. because
+			 * we are only searching a small chunk of the
+			 * btree each search, there is obviously free
+			 * inodes closer to the parent inode than we
+			 * are now. restart the search again.
+			 */
+			pag->pagl_pagino = NULLAGINO;
+			pag->pagl_leftrec = NULLAGINO;
+			pag->pagl_rightrec = NULLAGINO;
+			xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+			xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+			goto restart_pagno;
+		}
 	}
 
 	/*
 	 * In a different AG from the parent.
 	 * See if the most recently allocated block has any free.
 	 */
-newino:
 	if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
 		error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
 					 XFS_LOOKUP_EQ, &i);
-- 
cgit v1.2.3


From 799ea9e9c59949008770aab4e1da87f10e99dbe4 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Fri, 18 Aug 2017 18:08:25 -0700
Subject: xfs: evict all inodes involved with log redo item

When we introduced the bmap redo log items, we set MS_ACTIVE on the
mountpoint and XFS_IRECOVERY on the inode to prevent unlinked inodes
from being truncated prematurely during log recovery.  This also had the
effect of putting linked inodes on the lru instead of evicting them.

Unfortunately, we neglected to find all those unreferenced lru inodes
and evict them after finishing log recovery, which means that we leak
them if anything goes wrong in the rest of xfs_mountfs, because the lru
is only cleaned out on unmount.

Therefore, evict unreferenced inodes in the lru list immediately
after clearing MS_ACTIVE.

Fixes: 17c12bcd30 ("xfs: when replaying bmap operations, don't let unlinked inodes get reaped")
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Cc: viro@ZenIV.linux.org.uk
Reviewed-by: Brian Foster <bfoster@redhat.com>
---
 fs/inode.c       |  1 +
 fs/internal.h    |  1 -
 fs/xfs/xfs_log.c | 12 ++++++++++++
 3 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index 50370599e371..6a1626e0edaf 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -637,6 +637,7 @@ again:
 
 	dispose_list(&dispose);
 }
+EXPORT_SYMBOL_GPL(evict_inodes);
 
 /**
  * invalidate_inodes	- attempt to free all inodes on a superblock
diff --git a/fs/internal.h b/fs/internal.h
index 9676fe11c093..fedfe94d84ba 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -132,7 +132,6 @@ static inline bool atime_needs_update_rcu(const struct path *path,
 extern void inode_io_list_del(struct inode *inode);
 
 extern long get_nr_dirty_inodes(void);
-extern void evict_inodes(struct super_block *);
 extern int invalidate_inodes(struct super_block *, bool);
 
 /*
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index f467f1230444..bcb2f860e508 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -761,12 +761,24 @@ xfs_log_mount_finish(
 	 * inodes.  Turn it off immediately after recovery finishes
 	 * so that we don't leak the quota inodes if subsequent mount
 	 * activities fail.
+	 *
+	 * We let all inodes involved in redo item processing end up on
+	 * the LRU instead of being evicted immediately so that if we do
+	 * something to an unlinked inode, the irele won't cause
+	 * premature truncation and freeing of the inode, which results
+	 * in log recovery failure.  We have to evict the unreferenced
+	 * lru inodes after clearing MS_ACTIVE because we don't
+	 * otherwise clean up the lru if there's a subsequent failure in
+	 * xfs_mountfs, which leads to us leaking the inodes if nothing
+	 * else (e.g. quotacheck) references the inodes before the
+	 * mount failure occurs.
 	 */
 	mp->m_super->s_flags |= MS_ACTIVE;
 	error = xlog_recover_finish(mp->m_log);
 	if (!error)
 		xfs_log_work_queue(mp);
 	mp->m_super->s_flags &= ~MS_ACTIVE;
+	evict_inodes(mp->m_super);
 
 	if (readonly)
 		mp->m_flags |= XFS_MOUNT_RDONLY;
-- 
cgit v1.2.3


From f2e9ad212def50bcf4c098c6288779dd97fff0f0 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Fri, 25 Aug 2017 10:05:26 -0700
Subject: xfs: check for race with xfs_reclaim_inode() in xfs_ifree_cluster()

After xfs_ifree_cluster() finds an inode in the radix tree and verifies
that the inode number is what it expected, xfs_reclaim_inode() can swoop
in and free it. xfs_ifree_cluster() will then happily continue working
on the freed inode. Most importantly, it will mark the inode stale,
which will probably be overwritten when the inode slab object is
reallocated, but if it has already been reallocated then we can end up
with an inode spuriously marked stale.

In 8a17d7ddedb4 ("xfs: mark reclaimed inodes invalid earlier") we added
a second check to xfs_iflush_cluster() to detect this race, but the
similar RCU lookup in xfs_ifree_cluster() needs the same treatment.

Signed-off-by: Omar Sandoval <osandov@fb.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_icache.c | 10 +++++-----
 fs/xfs/xfs_inode.c  | 23 ++++++++++++++++++-----
 2 files changed, 23 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 0a9e6985a0d0..34227115a5d6 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1124,11 +1124,11 @@ reclaim:
 	 * Because we use RCU freeing we need to ensure the inode always appears
 	 * to be reclaimed with an invalid inode number when in the free state.
 	 * We do this as early as possible under the ILOCK so that
-	 * xfs_iflush_cluster() can be guaranteed to detect races with us here.
-	 * By doing this, we guarantee that once xfs_iflush_cluster has locked
-	 * XFS_ILOCK that it will see either a valid, flushable inode that will
-	 * serialise correctly, or it will see a clean (and invalid) inode that
-	 * it can skip.
+	 * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to
+	 * detect races with us here. By doing this, we guarantee that once
+	 * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that
+	 * it will see either a valid inode that will serialise correctly, or it
+	 * will see an invalid inode that it can skip.
 	 */
 	spin_lock(&ip->i_flags_lock);
 	ip->i_flags = XFS_IRECLAIM;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index ff48f0096810..97045e8dfed5 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2359,11 +2359,24 @@ retry:
 			 * already marked stale. If we can't lock it, back off
 			 * and retry.
 			 */
-			if (ip != free_ip &&
-			    !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
-				rcu_read_unlock();
-				delay(1);
-				goto retry;
+			if (ip != free_ip) {
+				if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
+					rcu_read_unlock();
+					delay(1);
+					goto retry;
+				}
+
+				/*
+				 * Check the inode number again in case we're
+				 * racing with freeing in xfs_reclaim_inode().
+				 * See the comments in that function for more
+				 * information as to why the initial check is
+				 * not sufficient.
+				 */
+				if (ip->i_ino != inum + i) {
+					xfs_iunlock(ip, XFS_ILOCK_EXCL);
+					continue;
+				}
 			}
 			rcu_read_unlock();
 
-- 
cgit v1.2.3


From 411350df14a3d6f1c769ea64a8b43a71f8d9760e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 28 Aug 2017 10:21:03 -0700
Subject: xfs: refactor xfs_trans_roll

Split xfs_trans_roll into a low-level helper that just rolls the
actual transaction and a new higher level xfs_trans_roll_inode
that takes care of logging and rejoining the inode.  This gets
rid of the NULL inode case, and allows to simplify the special
cases in the deferred operation code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_attr.c        | 16 ++++++++--------
 fs/xfs/libxfs/xfs_attr_leaf.c   |  6 +++---
 fs/xfs/libxfs/xfs_attr_remote.c |  4 ++--
 fs/xfs/libxfs/xfs_defer.c       | 23 +++++++++--------------
 fs/xfs/xfs_attr_inactive.c      |  6 +++---
 fs/xfs/xfs_inode.c              |  4 ++--
 fs/xfs/xfs_trans.c              | 28 +++++-----------------------
 fs/xfs/xfs_trans.h              |  3 ++-
 fs/xfs/xfs_trans_inode.c        | 14 ++++++++++++++
 9 files changed, 48 insertions(+), 56 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index de7b9bd30bec..bafa0f6bfafa 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -341,7 +341,7 @@ xfs_attr_set(
 		 * transaction to add the new attribute to the leaf.
 		 */
 
-		error = xfs_trans_roll(&args.trans, dp);
+		error = xfs_trans_roll_inode(&args.trans, dp);
 		if (error)
 			goto out;
 
@@ -605,7 +605,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 		 * Commit the current trans (including the inode) and start
 		 * a new one.
 		 */
-		error = xfs_trans_roll(&args->trans, dp);
+		error = xfs_trans_roll_inode(&args->trans, dp);
 		if (error)
 			return error;
 
@@ -620,7 +620,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 	 * Commit the transaction that added the attr name so that
 	 * later routines can manage their own transactions.
 	 */
-	error = xfs_trans_roll(&args->trans, dp);
+	error = xfs_trans_roll_inode(&args->trans, dp);
 	if (error)
 		return error;
 
@@ -697,7 +697,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 		/*
 		 * Commit the remove and start the next trans in series.
 		 */
-		error = xfs_trans_roll(&args->trans, dp);
+		error = xfs_trans_roll_inode(&args->trans, dp);
 
 	} else if (args->rmtblkno > 0) {
 		/*
@@ -885,7 +885,7 @@ restart:
 			 * Commit the node conversion and start the next
 			 * trans in the chain.
 			 */
-			error = xfs_trans_roll(&args->trans, dp);
+			error = xfs_trans_roll_inode(&args->trans, dp);
 			if (error)
 				goto out;
 
@@ -925,7 +925,7 @@ restart:
 	 * Commit the leaf addition or btree split and start the next
 	 * trans in the chain.
 	 */
-	error = xfs_trans_roll(&args->trans, dp);
+	error = xfs_trans_roll_inode(&args->trans, dp);
 	if (error)
 		goto out;
 
@@ -1012,7 +1012,7 @@ restart:
 		/*
 		 * Commit and start the next trans in the chain.
 		 */
-		error = xfs_trans_roll(&args->trans, dp);
+		error = xfs_trans_roll_inode(&args->trans, dp);
 		if (error)
 			goto out;
 
@@ -1132,7 +1132,7 @@ xfs_attr_node_removename(xfs_da_args_t *args)
 		/*
 		 * Commit the Btree join operation and start a new trans.
 		 */
-		error = xfs_trans_roll(&args->trans, dp);
+		error = xfs_trans_roll_inode(&args->trans, dp);
 		if (error)
 			goto out;
 	}
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index c6c15e5717e4..5c16db86b38f 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -2608,7 +2608,7 @@ xfs_attr3_leaf_clearflag(
 	/*
 	 * Commit the flag value change and start the next trans in series.
 	 */
-	return xfs_trans_roll(&args->trans, args->dp);
+	return xfs_trans_roll_inode(&args->trans, args->dp);
 }
 
 /*
@@ -2659,7 +2659,7 @@ xfs_attr3_leaf_setflag(
 	/*
 	 * Commit the flag value change and start the next trans in series.
 	 */
-	return xfs_trans_roll(&args->trans, args->dp);
+	return xfs_trans_roll_inode(&args->trans, args->dp);
 }
 
 /*
@@ -2777,7 +2777,7 @@ xfs_attr3_leaf_flipflags(
 	/*
 	 * Commit the flag value change and start the next trans in series.
 	 */
-	error = xfs_trans_roll(&args->trans, args->dp);
+	error = xfs_trans_roll_inode(&args->trans, args->dp);
 
 	return error;
 }
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 5236d8e45146..433c36714e40 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -484,7 +484,7 @@ xfs_attr_rmtval_set(
 		/*
 		 * Start the next trans in the chain.
 		 */
-		error = xfs_trans_roll(&args->trans, dp);
+		error = xfs_trans_roll_inode(&args->trans, dp);
 		if (error)
 			return error;
 	}
@@ -621,7 +621,7 @@ xfs_attr_rmtval_remove(
 		/*
 		 * Close out trans and start the next one in the chain.
 		 */
-		error = xfs_trans_roll(&args->trans, args->dp);
+		error = xfs_trans_roll_inode(&args->trans, args->dp);
 		if (error)
 			return error;
 	}
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index 5c2929f94bd3..4ea2f068d95c 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -240,23 +240,19 @@ xfs_defer_trans_abort(
 STATIC int
 xfs_defer_trans_roll(
 	struct xfs_trans		**tp,
-	struct xfs_defer_ops		*dop,
-	struct xfs_inode		*ip)
+	struct xfs_defer_ops		*dop)
 {
 	int				i;
 	int				error;
 
-	/* Log all the joined inodes except the one we passed in. */
-	for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) {
-		if (dop->dop_inodes[i] == ip)
-			continue;
+	/* Log all the joined inodes. */
+	for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++)
 		xfs_trans_log_inode(*tp, dop->dop_inodes[i], XFS_ILOG_CORE);
-	}
 
 	trace_xfs_defer_trans_roll((*tp)->t_mountp, dop);
 
 	/* Roll the transaction. */
-	error = xfs_trans_roll(tp, ip);
+	error = xfs_trans_roll(tp);
 	if (error) {
 		trace_xfs_defer_trans_roll_error((*tp)->t_mountp, dop, error);
 		xfs_defer_trans_abort(*tp, dop, error);
@@ -264,12 +260,9 @@ xfs_defer_trans_roll(
 	}
 	dop->dop_committed = true;
 
-	/* Rejoin the joined inodes except the one we passed in. */
-	for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) {
-		if (dop->dop_inodes[i] == ip)
-			continue;
+	/* Rejoin the joined inodes. */
+	for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++)
 		xfs_trans_ijoin(*tp, dop->dop_inodes[i], 0);
-	}
 
 	return error;
 }
@@ -331,13 +324,15 @@ xfs_defer_finish(
 
 	trace_xfs_defer_finish((*tp)->t_mountp, dop);
 
+	xfs_defer_join(dop, ip);
+
 	/* Until we run out of pending work to finish... */
 	while (xfs_defer_has_unfinished_work(dop)) {
 		/* Log intents for work items sitting in the intake. */
 		xfs_defer_intake_work(*tp, dop);
 
 		/* Roll the transaction. */
-		error = xfs_defer_trans_roll(tp, dop, ip);
+		error = xfs_defer_trans_roll(tp, dop);
 		if (error)
 			goto out;
 
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index be0b79d8900f..ebd66b19fbfc 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -97,7 +97,7 @@ xfs_attr3_leaf_freextent(
 			/*
 			 * Roll to next transaction.
 			 */
-			error = xfs_trans_roll(trans, dp);
+			error = xfs_trans_roll_inode(trans, dp);
 			if (error)
 				return error;
 		}
@@ -308,7 +308,7 @@ xfs_attr3_node_inactive(
 		/*
 		 * Atomically commit the whole invalidate stuff.
 		 */
-		error = xfs_trans_roll(trans, dp);
+		error = xfs_trans_roll_inode(trans, dp);
 		if (error)
 			return  error;
 	}
@@ -375,7 +375,7 @@ xfs_attr3_root_inactive(
 	/*
 	 * Commit the invalidate and start the next transaction.
 	 */
-	error = xfs_trans_roll(trans, dp);
+	error = xfs_trans_roll_inode(trans, dp);
 
 	return error;
 }
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 97045e8dfed5..f739a031986d 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1055,7 +1055,7 @@ xfs_dir_ialloc(
 			tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
 		}
 
-		code = xfs_trans_roll(&tp, NULL);
+		code = xfs_trans_roll(&tp);
 		if (committed != NULL)
 			*committed = 1;
 
@@ -1611,7 +1611,7 @@ xfs_itruncate_extents(
 		if (error)
 			goto out_bmap_cancel;
 
-		error = xfs_trans_roll(&tp, ip);
+		error = xfs_trans_roll_inode(&tp, ip);
 		if (error)
 			goto out;
 	}
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 2011620008de..a87f657f59c9 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1035,25 +1035,18 @@ xfs_trans_cancel(
  */
 int
 xfs_trans_roll(
-	struct xfs_trans	**tpp,
-	struct xfs_inode	*dp)
+	struct xfs_trans	**tpp)
 {
-	struct xfs_trans	*trans;
+	struct xfs_trans	*trans = *tpp;
 	struct xfs_trans_res	tres;
 	int			error;
 
-	/*
-	 * Ensure that the inode is always logged.
-	 */
-	trans = *tpp;
-	if (dp)
-		xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
-
 	/*
 	 * Copy the critical parameters from one trans to the next.
 	 */
 	tres.tr_logres = trans->t_log_res;
 	tres.tr_logcount = trans->t_log_count;
+
 	*tpp = xfs_trans_dup(trans);
 
 	/*
@@ -1067,10 +1060,8 @@ xfs_trans_roll(
 	if (error)
 		return error;
 
-	trans = *tpp;
-
 	/*
-	 * Reserve space in the log for th next transaction.
+	 * Reserve space in the log for the next transaction.
 	 * This also pushes items in the "AIL", the list of logged items,
 	 * out to disk if they are taking up space at the tail of the log
 	 * that we want to use.  This requires that either nothing be locked
@@ -1078,14 +1069,5 @@ xfs_trans_roll(
 	 * the prior and the next transactions.
 	 */
 	tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
-	error = xfs_trans_reserve(trans, &tres, 0, 0);
-	/*
-	 *  Ensure that the inode is in the new transaction and locked.
-	 */
-	if (error)
-		return error;
-
-	if (dp)
-		xfs_trans_ijoin(trans, dp, 0);
-	return 0;
+	return xfs_trans_reserve(*tpp, &tres, 0, 0);
 }
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 7d627721e4b3..b25d3d22e289 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -228,7 +228,8 @@ int		xfs_trans_free_extent(struct xfs_trans *,
 				      struct xfs_efd_log_item *, xfs_fsblock_t,
 				      xfs_extlen_t, struct xfs_owner_info *);
 int		xfs_trans_commit(struct xfs_trans *);
-int		xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
+int		xfs_trans_roll(struct xfs_trans **);
+int		xfs_trans_roll_inode(struct xfs_trans **, struct xfs_inode *);
 void		xfs_trans_cancel(xfs_trans_t *);
 int		xfs_trans_ail_init(struct xfs_mount *);
 void		xfs_trans_ail_destroy(struct xfs_mount *);
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index dab8daa676f9..daa7615497f9 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -134,3 +134,17 @@ xfs_trans_log_inode(
 	flags |= ip->i_itemp->ili_last_fields;
 	ip->i_itemp->ili_fields |= flags;
 }
+
+int
+xfs_trans_roll_inode(
+	struct xfs_trans	**tpp,
+	struct xfs_inode	*ip)
+{
+	int			error;
+
+	xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
+	error = xfs_trans_roll(tpp);
+	if (!error)
+		xfs_trans_ijoin(*tpp, ip, 0);
+	return error;
+}
-- 
cgit v1.2.3


From 882d8785fb87f691000a0b33c215364d74bd2ceb Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 28 Aug 2017 10:21:03 -0700
Subject: xfs: rename xfs_defer_join to xfs_defer_ijoin

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_bmap.c  | 2 +-
 fs/xfs/libxfs/xfs_defer.c | 4 ++--
 fs/xfs/libxfs/xfs_defer.h | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index c09c16b1ad3b..dcefadd4fc3a 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -6452,7 +6452,7 @@ __xfs_bmap_add(
 	bi->bi_whichfork = whichfork;
 	bi->bi_bmap = *bmap;
 
-	error = xfs_defer_join(dfops, bi->bi_owner);
+	error = xfs_defer_ijoin(dfops, bi->bi_owner);
 	if (error) {
 		kmem_free(bi);
 		return error;
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index 4ea2f068d95c..6c0da24c68c9 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -281,7 +281,7 @@ xfs_defer_has_unfinished_work(
  * to xfs_defer_finish().
  */
 int
-xfs_defer_join(
+xfs_defer_ijoin(
 	struct xfs_defer_ops		*dop,
 	struct xfs_inode		*ip)
 {
@@ -324,7 +324,7 @@ xfs_defer_finish(
 
 	trace_xfs_defer_finish((*tp)->t_mountp, dop);
 
-	xfs_defer_join(dop, ip);
+	xfs_defer_ijoin(dop, ip);
 
 	/* Until we run out of pending work to finish... */
 	while (xfs_defer_has_unfinished_work(dop)) {
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index f6e93ef0bffe..70c944b21a2a 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -77,7 +77,7 @@ int xfs_defer_finish(struct xfs_trans **tp, struct xfs_defer_ops *dop,
 void xfs_defer_cancel(struct xfs_defer_ops *dop);
 void xfs_defer_init(struct xfs_defer_ops *dop, xfs_fsblock_t *fbp);
 bool xfs_defer_has_unfinished_work(struct xfs_defer_ops *dop);
-int xfs_defer_join(struct xfs_defer_ops *dop, struct xfs_inode *ip);
+int xfs_defer_ijoin(struct xfs_defer_ops *dop, struct xfs_inode *ip);
 
 /* Description of a deferred type. */
 struct xfs_defer_op_type {
-- 
cgit v1.2.3


From 8ad7c629b18695ec1ee8654fb27599864049862b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 28 Aug 2017 10:21:04 -0700
Subject: xfs: remove the ip argument to xfs_defer_finish

And instead require callers to explicitly join the inode using
xfs_defer_ijoin.  Also consolidate the defer error handling in
a few places using a goto label.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_attr.c        | 140 +++++++++++++++++++++-------------------
 fs/xfs/libxfs/xfs_attr_remote.c |  35 +++++-----
 fs/xfs/libxfs/xfs_bmap.c        |   4 +-
 fs/xfs/libxfs/xfs_defer.c       |   8 +--
 fs/xfs/libxfs/xfs_defer.h       |   3 +-
 fs/xfs/libxfs/xfs_refcount.c    |   2 +-
 fs/xfs/xfs_bmap_item.c          |   2 +-
 fs/xfs/xfs_bmap_util.c          |  10 +--
 fs/xfs/xfs_dquot.c              |   2 +-
 fs/xfs/xfs_inode.c              |  13 ++--
 fs/xfs/xfs_iomap.c              |   6 +-
 fs/xfs/xfs_refcount_item.c      |   2 +-
 fs/xfs/xfs_reflink.c            |  11 ++--
 fs/xfs/xfs_rtalloc.c            |   2 +-
 fs/xfs/xfs_symlink.c            |   5 +-
 15 files changed, 129 insertions(+), 116 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index bafa0f6bfafa..6249c92671de 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -328,13 +328,12 @@ xfs_attr_set(
 		 */
 		xfs_defer_init(args.dfops, args.firstblock);
 		error = xfs_attr_shortform_to_leaf(&args);
-		if (!error)
-			error = xfs_defer_finish(&args.trans, args.dfops, dp);
-		if (error) {
-			args.trans = NULL;
-			xfs_defer_cancel(&dfops);
-			goto out;
-		}
+		if (error)
+			goto out_defer_cancel;
+		xfs_defer_ijoin(args.dfops, dp);
+		error = xfs_defer_finish(&args.trans, args.dfops);
+		if (error)
+			goto out_defer_cancel;
 
 		/*
 		 * Commit the leaf transformation.  We'll need another (lin