From 4f9d956d1939f97e2cb278b9615b6c683cd90e97 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 24 Aug 2017 11:52:21 -0400
Subject: ext4: do not unnecessarily allocate buffer in recently_deleted()

In recently_deleted() function we want to check whether inode is still
cached in buffer cache. Use sb_find_get_block() for that instead of
sb_getblk() to avoid unnecessary allocation of bdev page and buffer
heads.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ialloc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 507bfb3344d4..0d03e73dccaf 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -707,9 +707,9 @@ static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)
 	if (unlikely(!gdp))
 		return 0;
 
-	bh = sb_getblk(sb, ext4_inode_table(sb, gdp) +
+	bh = sb_find_get_block(sb, ext4_inode_table(sb, gdp) +
 		       (ino / inodes_per_block));
-	if (unlikely(!bh) || !buffer_uptodate(bh))
+	if (!bh || !buffer_uptodate(bh))
 		/*
 		 * If the block is not in the buffer cache, then it
 		 * must have been written out.
-- 
cgit v1.2.3


From 2fe435d8b0746cab8f5e13e1352a22742e84ff1a Mon Sep 17 00:00:00 2001
From: Wang Shilong <wshilong@ddn.com>
Date: Thu, 24 Aug 2017 11:58:18 -0400
Subject: ext4: cleanup goto next group

avoid duplicated codes, also we need goto
next group in case we found reserved inode.

Signed-off-by: Wang Shilong <wshilong@ddn.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/ialloc.c | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 0d03e73dccaf..9e6eb1c0e2ee 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -892,19 +892,13 @@ got_group:
 		/*
 		 * Check free inodes count before loading bitmap.
 		 */
-		if (ext4_free_inodes_count(sb, gdp) == 0) {
-			if (++group == ngroups)
-				group = 0;
-			continue;
-		}
+		if (ext4_free_inodes_count(sb, gdp) == 0)
+			goto next_group;
 
 		grp = ext4_get_group_info(sb, group);
 		/* Skip groups with already-known suspicious inode tables */
-		if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
-			if (++group == ngroups)
-				group = 0;
-			continue;
-		}
+		if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
+			goto next_group;
 
 		brelse(inode_bitmap_bh);
 		inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
@@ -912,9 +906,7 @@ got_group:
 		if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) ||
 		    IS_ERR(inode_bitmap_bh)) {
 			inode_bitmap_bh = NULL;
-			if (++group == ngroups)
-				group = 0;
-			continue;
+			goto next_group;
 		}
 
 repeat_in_this_group:
@@ -926,7 +918,7 @@ repeat_in_this_group:
 		if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) {
 			ext4_error(sb, "reserved inode found cleared - "
 				   "inode=%lu", ino + 1);
-			continue;
+			goto next_group;
 		}
 		if ((EXT4_SB(sb)->s_journal == NULL) &&
 		    recently_deleted(sb, group, ino)) {
-- 
cgit v1.2.3


From 901ed070df3c2c19e3083a734eafc82599fe991b Mon Sep 17 00:00:00 2001
From: Wang Shilong <wshilong@ddn.com>
Date: Thu, 24 Aug 2017 12:56:35 -0400
Subject: ext4: reduce lock contention in __ext4_new_inode

While running number of creating file threads concurrently,
we found heavy lock contention on group spinlock:

FUNC                           TOTAL_TIME(us)       COUNT        AVG(us)
ext4_create                    1707443399           1440000      1185.72
_raw_spin_lock                 1317641501           180899929    7.28
jbd2__journal_start            287821030            1453950      197.96
jbd2_journal_get_write_access  33441470             73077185     0.46
ext4_add_nondir                29435963             1440000      20.44
ext4_add_entry                 26015166             1440049      18.07
ext4_dx_add_entry              25729337             1432814      17.96
ext4_mark_inode_dirty          12302433             5774407      2.13

most of cpu time blames to _raw_spin_lock, here is some testing
numbers with/without patch.

Test environment:
Server : SuperMicro Sever (2 x E5-2690 v3@2.60GHz, 128GB 2133MHz
         DDR4 Memory, 8GbFC)
Storage : 2 x RAID1 (DDN SFA7700X, 4 x Toshiba PX02SMU020 200GB
          Read Intensive SSD)

format command:
        mkfs.ext4 -J size=4096

test command:
        mpirun -np 48 mdtest -n 30000 -d /ext4/mdtest.out -F -C \
                -r -i 1 -v -p 10 -u #first run to load inode

        mpirun -np 48 mdtest -n 30000 -d /ext4/mdtest.out -F -C \
                -r -i 3 -v -p 10 -u

Kernel version: 4.13.0-rc3

Test  1,440,000 files with 48 directories by 48 processes:

Without patch:

File Creation   File removal
79,033          289,569 ops/per second
81,463          285,359
79,875          288,475

With patch:
File Creation   File removal
810669		301694
812805		302711
813965		297670

Creation performance is improved more than 10X with large
journal size. The main problem here is we test bitmap
and do some check and journal operations which could be
slept, then we test and set with lock hold, this could
be racy, and make 'inode' steal by other process.

However, after first try, we could confirm handle has
been started and inode bitmap journaled too, then
we could find and set bit with lock hold directly, this
will mostly gurateee success with second try.

Tested-by: Shuichi Ihara <sihara@ddn.com>
Signed-off-by: Wang Shilong <wshilong@ddn.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/ialloc.c | 50 +++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 39 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 9e6eb1c0e2ee..fb83a36b9723 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -730,6 +730,27 @@ out:
 	return ret;
 }
 
+static int find_inode_bit(struct super_block *sb, ext4_group_t group,
+			  struct buffer_head *bitmap, unsigned long *ino)
+{
+next:
+	*ino = ext4_find_next_zero_bit((unsigned long *)
+				       bitmap->b_data,
+				       EXT4_INODES_PER_GROUP(sb), *ino);
+	if (*ino >= EXT4_INODES_PER_GROUP(sb))
+		return 0;
+
+	if ((EXT4_SB(sb)->s_journal == NULL) &&
+	    recently_deleted(sb, group, *ino)) {
+		*ino = *ino + 1;
+		if (*ino < EXT4_INODES_PER_GROUP(sb))
+			goto next;
+		return 0;
+	}
+
+	return 1;
+}
+
 /*
  * There are two policies for allocating an inode.  If the new inode is
  * a directory, then a forward search is made for a block group with both
@@ -910,21 +931,16 @@ got_group:
 		}
 
 repeat_in_this_group:
-		ino = ext4_find_next_zero_bit((unsigned long *)
-					      inode_bitmap_bh->b_data,
-					      EXT4_INODES_PER_GROUP(sb), ino);
-		if (ino >= EXT4_INODES_PER_GROUP(sb))
+		ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino);
+		if (!ret2)
 			goto next_group;
-		if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) {
+
+		if (group == 0 && (ino + 1) < EXT4_FIRST_INO(sb)) {
 			ext4_error(sb, "reserved inode found cleared - "
 				   "inode=%lu", ino + 1);
 			goto next_group;
 		}
-		if ((EXT4_SB(sb)->s_journal == NULL) &&
-		    recently_deleted(sb, group, ino)) {
-			ino++;
-			goto next_inode;
-		}
+
 		if (!handle) {
 			BUG_ON(nblocks <= 0);
 			handle = __ext4_journal_start_sb(dir->i_sb, line_no,
@@ -944,11 +960,23 @@ repeat_in_this_group:
 		}
 		ext4_lock_group(sb, group);
 		ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
+		if (ret2) {
+			/* Someone already took the bit. Repeat the search
+			 * with lock held.
+			 */
+			ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino);
+			if (ret2) {
+				ext4_set_bit(ino, inode_bitmap_bh->b_data);
+				ret2 = 0;
+			} else {
+				ret2 = 1; /* we didn't grab the inode */
+			}
+		}
 		ext4_unlock_group(sb, group);
 		ino++;		/* the inode bitmap is zero-based */
 		if (!ret2)
 			goto got; /* we grabbed the inode! */
-next_inode:
+
 		if (ino < EXT4_INODES_PER_GROUP(sb))
 			goto repeat_in_this_group;
 next_group:
-- 
cgit v1.2.3


From 1bd8d6cd3e413d64e543ec3e69ff43e75a1cf1ea Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Thu, 24 Aug 2017 13:22:06 -0400
Subject: ext4: in ext4_seek_{hole,data}, return -ENXIO for negative offsets

In the ext4 implementations of SEEK_HOLE and SEEK_DATA, make sure we
return -ENXIO for negative offsets instead of banging around inside
the extent code and returning -EFSCORRUPTED.

Reported-by: Mateusz S <muttdini@gmail.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org # 4.6
---
 fs/ext4/file.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 0d7cf0cc9b87..86ea1d92839a 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -595,7 +595,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
 	inode_lock(inode);
 
 	isize = i_size_read(inode);
-	if (offset >= isize) {
+	if (offset < 0 || offset >= isize) {
 		inode_unlock(inode);
 		return -ENXIO;
 	}
@@ -658,7 +658,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
 	inode_lock(inode);
 
 	isize = i_size_read(inode);
-	if (offset >= isize) {
+	if (offset < 0 || offset >= isize) {
 		inode_unlock(inode);
 		return -ENXIO;
 	}
-- 
cgit v1.2.3


From d695a1bea351276615ad270860bc1fbddcfbaeb3 Mon Sep 17 00:00:00 2001
From: Markus Elfring <elfring@users.sourceforge.net>
Date: Thu, 24 Aug 2017 13:50:24 -0400
Subject: ext4: use sizeof(*ptr)

Replace the specification of data structures by pointer dereferences
as the parameter for the operator "sizeof" to make the corresponding size
determination a bit safer according to the Linux coding style convention.

This issue was detected by using the Coccinelle software.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
---
 fs/ext4/dir.c | 2 +-
 fs/ext4/mmp.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index e8b365000d73..b04e882179c6 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -411,7 +411,7 @@ static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp,
 {
 	struct dir_private_info *p;
 
-	p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
 	if (!p)
 		return NULL;
 	p->curr_hash = pos2maj_hash(filp, pos);
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index eb9835638680..77cdce1f17ce 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -367,7 +367,7 @@ skip:
 		goto failed;
 	}
 
-	mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL);
+	mmpd_data = kmalloc(sizeof(*mmpd_data), GFP_KERNEL);
 	if (!mmpd_data) {
 		ext4_warning(sb, "not enough memory for mmpd_data");
 		goto failed;
-- 
cgit v1.2.3


From eaa093d2c0d15ad48f66eee414452e4eaa2be3d2 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 24 Aug 2017 13:59:24 -0400
Subject: ext4: remove timebomb in ext4_decode_extra_time()

Changing behavior based on the version code is a timebomb waiting to
happen, and not easily bisectable.  Drop it and leave any removal
to explicit developer action. (And I don't think file system
should _ever_ remove backwards compatibility that has no explicit
flag, but I'll leave that to the ext4 folks).

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Eric Biggers <ebiggers@google.com>
---
 fs/ext4/ext4.h | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index a2bb7d2870e4..08d693255567 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -838,13 +838,11 @@ static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
 {
 	if (unlikely(sizeof(time->tv_sec) > 4 &&
 			(extra & cpu_to_le32(EXT4_EPOCH_MASK)))) {
-#if LINUX_VERSION_CODE < KERNEL_VERSION(4,20,0)
+
+#if 1
 		/* Handle legacy encoding of pre-1970 dates with epoch
-		 * bits 1,1.  We assume that by kernel version 4.20,
-		 * everyone will have run fsck over the affected
-		 * filesystems to correct the problem.  (This
-		 * backwards compatibility may be removed before this
-		 * time, at the discretion of the ext4 developers.)
+		 * bits 1,1. (This backwards compatibility may be removed
+		 * at the discretion of the ext4 developers.)
 		 */
 		u64 extra_bits = le32_to_cpu(extra) & EXT4_EPOCH_MASK;
 		if (extra_bits == 3 && ((time->tv_sec) & 0x80000000) != 0)
-- 
cgit v1.2.3


From a6d05676047ec9ef7b98087f8b19a5283dd5a8ce Mon Sep 17 00:00:00 2001
From: Tahsin Erdogan <tahsin@google.com>
Date: Thu, 24 Aug 2017 14:25:02 -0400
Subject: ext4: backward compatibility support for Lustre ea_inode
 implementation

Original Lustre ea_inode feature did not have ref counts on xattr inodes
because there was always one parent that referenced it. New
implementation expects ref count to be initialized which is not true for
Lustre case. Handle this by detecting Lustre created xattr inode and set
its ref count to 1.

The quota handling of xattr inodes have also changed with deduplication
support. New implementation manually manages quotas to support sharing
across multiple users. A consequence is that, a referencing inode
incorporates the blocks of xattr inode into its own i_block field.

We need to know how a xattr inode was created so that we can reverse the
block charges during reference removal. This is handled by introducing a
EXT4_STATE_LUSTRE_EA_INODE flag. The flag is set on a xattr inode if
inode appears to have been created by Lustre. During xattr inode reference
removal, the manual quota uncharge is skipped if the flag is set.

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h  |   1 +
 fs/ext4/inode.c |   8 ----
 fs/ext4/xattr.c | 141 +++++++++++++++++++++++++++++++++++++-------------------
 3 files changed, 94 insertions(+), 56 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 08d693255567..84b9da192238 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1565,6 +1565,7 @@ enum {
 					   nolocking */
 	EXT4_STATE_MAY_INLINE_DATA,	/* may have in-inode data */
 	EXT4_STATE_EXT_PRECACHED,	/* extents have been precached */
+	EXT4_STATE_LUSTRE_EA_INODE,	/* Lustre-style ea_inode */
 };
 
 #define EXT4_INODE_BIT_FNS(name, field, offset)				\
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c774bdc22759..714396760616 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4897,14 +4897,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	brelse(iloc.bh);
 	ext4_set_inode_flags(inode);
 
-	if (ei->i_flags & EXT4_EA_INODE_FL) {
-		ext4_xattr_inode_set_class(inode);
-
-		inode_lock(inode);
-		inode->i_flags |= S_NOQUOTA;
-		inode_unlock(inode);
-	}
-
 	unlock_new_inode(inode);
 	return inode;
 
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 3dd970168448..3b69330a4250 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -354,8 +354,10 @@ free_bhs:
 	return ret;
 }
 
+#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode)->i_mtime.tv_sec)
+
 static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
-				 struct inode **ea_inode)
+				 u32 ea_inode_hash, struct inode **ea_inode)
 {
 	struct inode *inode;
 	int err;
@@ -385,6 +387,24 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
 		goto error;
 	}
 
+	ext4_xattr_inode_set_class(inode);
+
+	/*
+	 * Check whether this is an old Lustre-style xattr inode. Lustre
+	 * implementation does not have hash validation, rather it has a
+	 * backpointer from ea_inode to the parent inode.
+	 */
+	if (ea_inode_hash != ext4_xattr_inode_get_hash(inode) &&
+	    EXT4_XATTR_INODE_GET_PARENT(inode) == parent->i_ino &&
+	    inode->i_generation == parent->i_generation) {
+		ext4_set_inode_state(inode, EXT4_STATE_LUSTRE_EA_INODE);
+		ext4_xattr_inode_set_ref(inode, 1);
+	} else {
+		inode_lock(inode);
+		inode->i_flags |= S_NOQUOTA;
+		inode_unlock(inode);
+	}
+
 	*ea_inode = inode;
 	return 0;
 error:
@@ -417,8 +437,6 @@ ext4_xattr_inode_verify_hashes(struct inode *ea_inode,
 	return 0;
 }
 
-#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode)->i_mtime.tv_sec)
-
 /*
  * Read xattr value from the EA inode.
  */
@@ -431,7 +449,7 @@ ext4_xattr_inode_get(struct inode *inode, struct ext4_xattr_entry *entry,
 	int err;
 
 	err = ext4_xattr_inode_iget(inode, le32_to_cpu(entry->e_value_inum),
-				    &ea_inode);
+				    le32_to_cpu(entry->e_hash), &ea_inode);
 	if (err) {
 		ea_inode = NULL;
 		goto out;
@@ -449,29 +467,20 @@ ext4_xattr_inode_get(struct inode *inode, struct ext4_xattr_entry *entry,
 	if (err)
 		goto out;
 
-	err = ext4_xattr_inode_verify_hashes(ea_inode, entry, buffer, size);
-	/*
-	 * Compatibility check for old Lustre ea_inode implementation. Old
-	 * version does not have hash validation, but it has a backpointer
-	 * from ea_inode to the parent inode.
-	 */
-	if (err == -EFSCORRUPTED) {
-		if (EXT4_XATTR_INODE_GET_PARENT(ea_inode) != inode->i_ino ||
-		    ea_inode->i_generation != inode->i_generation) {
+	if (!ext4_test_inode_state(ea_inode, EXT4_STATE_LUSTRE_EA_INODE)) {
+		err = ext4_xattr_inode_verify_hashes(ea_inode, entry, buffer,
+						     size);
+		if (err) {
 			ext4_warning_inode(ea_inode,
 					   "EA inode hash validation failed");
 			goto out;
 		}
-		/* Do not add ea_inode to the cache. */
-		ea_inode_cache = NULL;
-		err = 0;
-	} else if (err)
-		goto out;
 
-	if (ea_inode_cache)
-		mb_cache_entry_create(ea_inode_cache, GFP_NOFS,
-				      ext4_xattr_inode_get_hash(ea_inode),
-				      ea_inode->i_ino, true /* reusable */);
+		if (ea_inode_cache)
+			mb_cache_entry_create(ea_inode_cache, GFP_NOFS,
+					ext4_xattr_inode_get_hash(ea_inode),
+					ea_inode->i_ino, true /* reusable */);
+	}
 out:
 	iput(ea_inode);
 	return err;
@@ -838,10 +847,15 @@ static int ext4_xattr_inode_alloc_quota(struct inode *inode, size_t len)
 	return err;
 }
 
-static void ext4_xattr_inode_free_quota(struct inode *inode, size_t len)
+static void ext4_xattr_inode_free_quota(struct inode *parent,
+					struct inode *ea_inode,
+					size_t len)
 {
-	dquot_free_space_nodirty(inode, round_up_cluster(inode, len));
-	dquot_free_inode(inode);
+	if (ea_inode &&
+	    ext4_test_inode_state(ea_inode, EXT4_STATE_LUSTRE_EA_INODE))
+		return;
+	dquot_free_space_nodirty(parent, round_up_cluster(parent, len));
+	dquot_free_inode(parent);
 }
 
 int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode,
@@ -1071,7 +1085,9 @@ static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent,
 		if (!entry->e_value_inum)
 			continue;
 		ea_ino = le32_to_cpu(entry->e_value_inum);
-		err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
+		err = ext4_xattr_inode_iget(parent, ea_ino,
+					    le32_to_cpu(entry->e_hash),
+					    &ea_inode);
 		if (err)
 			goto cleanup;
 		err = ext4_xattr_inode_inc_ref(handle, ea_inode);
@@ -1093,7 +1109,9 @@ cleanup:
 		if (!entry->e_value_inum)
 			continue;
 		ea_ino = le32_to_cpu(entry->e_value_inum);
-		err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
+		err = ext4_xattr_inode_iget(parent, ea_ino,
+					    le32_to_cpu(entry->e_hash),
+					    &ea_inode);
 		if (err) {
 			ext4_warning(parent->i_sb,
 				     "cleanup ea_ino %u iget error %d", ea_ino,
@@ -1131,7 +1149,9 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
 		if (!entry->e_value_inum)
 			continue;
 		ea_ino = le32_to_cpu(entry->e_value_inum);
-		err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
+		err = ext4_xattr_inode_iget(parent, ea_ino,
+					    le32_to_cpu(entry->e_hash),
+					    &ea_inode);
 		if (err)
 			continue;
 
@@ -1159,7 +1179,7 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
 		}
 
 		if (!skip_quota)
-			ext4_xattr_inode_free_quota(parent,
+			ext4_xattr_inode_free_quota(parent, ea_inode,
 					      le32_to_cpu(entry->e_value_size));
 
 		/*
@@ -1591,6 +1611,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
 	if (!s->not_found && here->e_value_inum) {
 		ret = ext4_xattr_inode_iget(inode,
 					    le32_to_cpu(here->e_value_inum),
+					    le32_to_cpu(here->e_hash),
 					    &old_ea_inode);
 		if (ret) {
 			old_ea_inode = NULL;
@@ -1609,7 +1630,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
 						     &new_ea_inode);
 		if (ret) {
 			new_ea_inode = NULL;
-			ext4_xattr_inode_free_quota(inode, i->value_len);
+			ext4_xattr_inode_free_quota(inode, NULL, i->value_len);
 			goto out;
 		}
 	}
@@ -1628,13 +1649,13 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
 					ext4_warning_inode(new_ea_inode,
 						  "dec ref new_ea_inode err=%d",
 						  err);
-				ext4_xattr_inode_free_quota(inode,
+				ext4_xattr_inode_free_quota(inode, new_ea_inode,
 							    i->value_len);
 			}
 			goto out;
 		}
 
-		ext4_xattr_inode_free_quota(inode,
+		ext4_xattr_inode_free_quota(inode, old_ea_inode,
 					    le32_to_cpu(here->e_value_size));
 	}
 
@@ -1805,8 +1826,10 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 	struct mb_cache_entry *ce = NULL;
 	int error = 0;
 	struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode);
-	struct inode *ea_inode = NULL;
-	size_t old_ea_inode_size = 0;
+	struct inode *ea_inode = NULL, *tmp_inode;
+	size_t old_ea_inode_quota = 0;
+	unsigned int ea_ino;
+
 
 #define header(x) ((struct ext4_xattr_header *)(x))
 
@@ -1865,12 +1888,24 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 			 * like it has an empty value.
 			 */
 			if (!s->not_found && s->here->e_value_inum) {
-				/*
-				 * Defer quota free call for previous inode
-				 * until success is guaranteed.
-				 */
-				old_ea_inode_size = le32_to_cpu(
+				ea_ino = le32_to_cpu(s->here->e_value_inum);
+				error = ext4_xattr_inode_iget(inode, ea_ino,
+					      le32_to_cpu(s->here->e_hash),
+					      &tmp_inode);
+				if (error)
+					goto cleanup;
+
+				if (!ext4_test_inode_state(tmp_inode,
+						EXT4_STATE_LUSTRE_EA_INODE)) {
+					/*
+					 * Defer quota free call for previous
+					 * inode until success is guaranteed.
+					 */
+					old_ea_inode_quota = le32_to_cpu(
 							s->here->e_value_size);
+				}
+				iput(tmp_inode);
+
 				s->here->e_value_inum = 0;
 				s->here->e_value_size = 0;
 			}
@@ -1897,8 +1932,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 		goto cleanup;
 
 	if (i->value && s->here->e_value_inum) {
-		unsigned int ea_ino;
-
 		/*
 		 * A ref count on ea_inode has been taken as part of the call to
 		 * ext4_xattr_set_entry() above. We would like to drop this
@@ -1906,7 +1939,9 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 		 * initialized and has its own ref count on the ea_inode.
 		 */
 		ea_ino = le32_to_cpu(s->here->e_value_inum);
-		error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
+		error = ext4_xattr_inode_iget(inode, ea_ino,
+					      le32_to_cpu(s->here->e_hash),
+					      &ea_inode);
 		if (error) {
 			ea_inode = NULL;
 			goto cleanup;
@@ -2056,8 +2091,8 @@ getblk_failed:
 		}
 	}
 
-	if (old_ea_inode_size)
-		ext4_xattr_inode_free_quota(inode, old_ea_inode_size);
+	if (old_ea_inode_quota)
+		ext4_xattr_inode_free_quota(inode, NULL, old_ea_inode_quota);
 
 	/* Update the inode. */
 	EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
@@ -2084,7 +2119,7 @@ cleanup:
 
 		/* If there was an error, revert the quota charge. */
 		if (error)
-			ext4_xattr_inode_free_quota(inode,
+			ext4_xattr_inode_free_quota(inode, ea_inode,
 						    i_size_read(ea_inode));
 		iput(ea_inode);
 	}
@@ -2800,6 +2835,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 	struct ext4_xattr_ibody_header *header;
 	struct ext4_iloc iloc = { .bh = NULL };
 	struct ext4_xattr_entry *entry;
+	struct inode *ea_inode;
 	int error;
 
 	error = ext4_xattr_ensure_credits(handle, inode, extra_credits,
@@ -2854,10 +2890,19 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 
 		if (ext4_has_feature_ea_inode(inode->i_sb)) {
 			for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
-			     entry = EXT4_XATTR_NEXT(entry))
-				if (entry->e_value_inum)
-					ext4_xattr_inode_free_quota(inode,
+			     entry = EXT4_XATTR_NEXT(entry)) {
+				if (!entry->e_value_inum)
+					continue;
+				error = ext4_xattr_inode_iget(inode,
+					      le32_to_cpu(entry->e_value_inum),
+					      le32_to_cpu(entry->e_hash),
+					      &ea_inode);
+				if (error)
+					continue;
+				ext4_xattr_inode_free_quota(inode, ea_inode,
 					      le32_to_cpu(entry->e_value_size));
+				iput(ea_inode);
+			}
 
 		}
 
-- 
cgit v1.2.3


From 918dc9d0ab8565886de3ea8bac329e2b102e7f3a Mon Sep 17 00:00:00 2001
From: Damien Guibouret <damien.guibouret@partition-saving.com>
Date: Thu, 24 Aug 2017 15:11:34 -0400
Subject: ext4: remove useless test and assignment in strtohash functions

On transformation of str to hash, computed value is initialised before
first byte modulo 4. But it is already initialised before entering loop
and after processing last byte modulo 4. So the corresponding test and
initialisation could be removed.

Signed-off-by: Damien Guibouret <damien.guibouret@partition-saving.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/hash.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index 38b8a96eb97c..00c6dd29e621 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -148,8 +148,6 @@ static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
 	if (len > num*4)
 		len = num * 4;
 	for (i = 0; i < len; i++) {
-		if ((i % 4) == 0)
-			val = pad;
 		val = ((int) scp[i]) + (val << 8);
 		if ((i % 4) == 3) {
 			*buf++ = val;
@@ -176,8 +174,6 @@ static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
 	if (len > num*4)
 		len = num * 4;
 	for (i = 0; i < len; i++) {
-		if ((i % 4) == 0)
-			val = pad;
 		val = ((int) ucp[i]) + (val << 8);
 		if ((i % 4) == 3) {
 			*buf++ = val;
-- 
cgit v1.2.3


From b0a5a9589decd07db755d6a8d9c0910d96ff7992 Mon Sep 17 00:00:00 2001
From: "zhangyi (F)" <yi.zhang@huawei.com>
Date: Thu, 24 Aug 2017 15:19:39 -0400
Subject: ext4: fix incorrect quotaoff if the quota feature is enabled

Current ext4 quota should always "usage enabled" if the
quota feautre is enabled. But in ext4_orphan_cleanup(), it
turn quotas off directly (used for the older journaled
quota), so we cannot turn it on again via "quotaon" unless
umount and remount ext4.

Simple reproduce:

  mkfs.ext4 -O project,quota /dev/vdb1
  mount -o prjquota /dev/vdb1 /mnt
  chattr -p 123 /mnt
  chattr +P /mnt
  touch /mnt/aa /mnt/bb
  exec 100<>/mnt/aa
  rm -f /mnt/aa
  sync
  echo c > /proc/sysrq-trigger

  #reboot and mount
  mount -o prjquota /dev/vdb1 /mnt
  #query status
  quotaon -Ppv /dev/vdb1
  #output
  quotaon: Cannot find mountpoint for device /dev/vdb1
  quotaon: No correct mountpoint specified.

This patch add check for journaled quotas to avoid incorrect
quotaoff when ext4 has quota feautre.

Signed-off-by: zhangyi (F) <yi.zhang@huawei.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: stable@vger.kernel.org # 3.18
---
 fs/ext4/super.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index d61a70e2193a..cade5c894f4a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2442,7 +2442,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 #ifdef CONFIG_QUOTA
 	/* Needed for iput() to work correctly and not trash data */
 	sb->s_flags |= MS_ACTIVE;
-	/* Turn on quotas so that they are updated correctly */
+	/* Turn on journaled quotas so that they are updated correctly */
 	for (i = 0; i < EXT4_MAXQUOTAS; i++) {
 		if (EXT4_SB(sb)->s_qf_names[i]) {
 			int ret = ext4_quota_on_mount(sb, i);
@@ -2510,9 +2510,9 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 		ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
 		       PLURAL(nr_truncates));
 #ifdef CONFIG_QUOTA
-	/* Turn quotas off */
+	/* Turn off journaled quotas if they were enabled for orphan cleanup */
 	for (i = 0; i < EXT4_MAXQUOTAS; i++) {
-		if (sb_dqopt(sb)->files[i])
+		if (EXT4_SB(sb)->s_qf_names[i] && sb_dqopt(sb)->files[i])
 			dquot_quota_off(sb, i);
 	}
 #endif
-- 
cgit v1.2.3


From 95f1fda47c9d8738f858c3861add7bf0a36a7c0b Mon Sep 17 00:00:00 2001
From: "zhangyi (F)" <yi.zhang@huawei.com>
Date: Thu, 24 Aug 2017 15:21:50 -0400
Subject: ext4: fix quota inconsistency during orphan cleanup for read-only
 mounts

Quota does not get enabled for read-only mounts if filesystem
has quota feature, so that quotas cannot updated during orphan
cleanup, which will lead to quota inconsistency.

This patch turn on quotas during orphan cleanup for this case,
make sure quotas can be updated correctly.

Reported-by: Jan Kara <jack@suse.cz>
Signed-off-by: zhangyi (F) <yi.zhang@huawei.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: stable@vger.kernel.org # 3.18+
---
 fs/ext4/super.c | 38 +++++++++++++++++++++++++++++++-------
 1 file changed, 31 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index cade5c894f4a..c9e7be58756b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2404,6 +2404,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 	unsigned int s_flags = sb->s_flags;
 	int ret, nr_orphans = 0, nr_truncates = 0;
 #ifdef CONFIG_QUOTA
+	int quota_update = 0;
 	int i;
 #endif
 	if (!es->s_last_orphan) {
@@ -2442,14 +2443,32 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 #ifdef CONFIG_QUOTA
 	/* Needed for iput() to work correctly and not trash data */
 	sb->s_flags |= MS_ACTIVE;
-	/* Turn on journaled quotas so that they are updated correctly */
+
+	/*
+	 * Turn on quotas which were not enabled for read-only mounts if
+	 * filesystem has quota feature, so that they are updated correctly.
+	 */
+	if (ext4_has_feature_quota(sb) && (s_flags & MS_RDONLY)) {
+		int ret = ext4_enable_quotas(sb);
+
+		if (!ret)
+			quota_update = 1;
+		else
+			ext4_msg(sb, KERN_ERR,
+				"Cannot turn on quotas: error %d", ret);
+	}
+
+	/* Turn on journaled quotas used for old sytle */
 	for (i = 0; i < EXT4_MAXQUOTAS; i++) {
 		if (EXT4_SB(sb)->s_qf_names[i]) {
 			int ret = ext4_quota_on_mount(sb, i);
-			if (ret < 0)
+
+			if (!ret)
+				quota_update = 1;
+			else
 				ext4_msg(sb, KERN_ERR,
 					"Cannot turn on journaled "
-					"quota: error %d", ret);
+					"quota: type %d: error %d", i, ret);
 		}
 	}
 #endif
@@ -2510,10 +2529,12 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 		ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
 		       PLURAL(nr_truncates));
 #ifdef CONFIG_QUOTA
-	/* Turn off journaled quotas if they were enabled for orphan cleanup */
-	for (i = 0; i < EXT4_MAXQUOTAS; i++) {
-		if (EXT4_SB(sb)->s_qf_names[i] && sb_dqopt(sb)->files[i])
-			dquot_quota_off(sb, i);
+	/* Turn off quotas if they were enabled for orphan cleanup */
+	if (quota_update) {
+		for (i = 0; i < EXT4_MAXQUOTAS; i++) {
+			if (sb_dqopt(sb)->files[i])
+				dquot_quota_off(sb, i);
+		}
 	}
 #endif
 	sb->s_flags = s_flags; /* Restore MS_RDONLY status */
@@ -5512,6 +5533,9 @@ static int ext4_enable_quotas(struct super_block *sb)
 				DQUOT_USAGE_ENABLED |
 				(quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0));
 			if (err) {
+				for (type--; type >= 0; type--)
+					dquot_quota_off(sb, type);
+
 				ext4_warning(sb,
 					"Failed to enable quota tracking "
 					"(type=%d, err=%d). Please run "
-- 
cgit v1.2.3


From fd96b8da68d32a9403726db09b229f4b5ac849c7 Mon Sep 17 00:00:00 2001
From: Randy Dodgen <dodgen@google.com>
Date: Thu, 24 Aug 2017 15:26:01 -0400
Subject: ext4: fix fault handling when mounted with -o dax,ro

If an ext4 filesystem is mounted with both the DAX and read-only
options, executables on that filesystem will fail to start (claiming
'Segmentation fault') due to the fault handler returning
VM_FAULT_SIGBUS.

This is due to the DAX fault handler (see ext4_dax_huge_fault)
attempting to write to the journal when FAULT_FLAG_WRITE is set. This is
the wrong behavior for write faults which will lead to a COW page; in
particular, this fails for readonly mounts.

This change avoids journal writes for faults that are expected to COW.

It might be the case that this could be better handled in
ext4_iomap_begin / ext4_iomap_end (called via iomap_ops inside
dax_iomap_fault). These is some overlap already (e.g. grabbing journal
handles).

Signed-off-by: Randy Dodgen <dodgen@google.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
---
 fs/ext4/file.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 86ea1d92839a..197653ea6041 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -279,7 +279,20 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf,
 	handle_t *handle = NULL;
 	struct inode *inode = file_inode(vmf->vma->vm_file);
 	struct super_block *sb = inode->i_sb;
-	bool write = vmf->flags & FAULT_FLAG_WRITE;
+
+	/*
+	 * We have to distinguish real writes from writes which will result in a
+	 * COW page; COW writes should *not* poke the journal (the file will not
+	 * be changed). Doing so would cause unintended failures when mounted
+	 * read-only.
+	 *
+	 * We check for VM_SHARED rather than vmf->cow_page since the latter is
+	 * unset for pe_size != PE_SIZE_PTE (i.e. only in do_cow_fault); for
+	 * other sizes, dax_iomap_fault will handle splitting / fallback so that
+	 * we eventually come back with a COW page.
+	 */
+	bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
+		(vmf->vma->vm_flags & VM_SHARED);
 
 	if (write) {
 		sb_start_pagefault(sb);
-- 
cgit v1.2.3


From b5f515735bea4ae71c248aea3e049073f8852889 Mon Sep 17 00:00:00 2001
From: Andreas Dilger <adilger@dilger.ca>
Date: Thu, 31 Aug 2017 11:09:45 -0400
Subject: ext4: avoid Y2038 overflow in recently_deleted()

Avoid a 32-bit time overflow in recently_deleted() since i_dtime
(inode deletion time) is stored only as a 32-bit value on disk.
Since i_dtime isn't used for much beyond a boolean value in e2fsck
and is otherwise only used in this function in the kernel, there is
no benefit to use more space in the inode for this field on disk.

Instead, compare only the relative deletion time with the low
32 bits of the time using the newly-added time_before32() helper,
which is similar to time_before() and time_after() for jiffies.

Increase RECENTCY_DIRTY to 300s based on Ted's comments about
usage experience at Google.

Signed-off-by: Andreas Dilger <adilger@dilger.ca>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
---
 fs/ext4/ialloc.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index fb83a36b9723..71e93a23cec3 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -692,16 +692,17 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
  * somewhat arbitrary...)
  */
 #define RECENTCY_MIN	5
-#define RECENTCY_DIRTY	30
+#define RECENTCY_DIRTY	300
 
 static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)
 {
 	struct ext4_group_desc	*gdp;
 	struct ext4_inode	*raw_inode;
 	struct buffer_head	*bh;
-	unsigned long		dtime, now;
-	int	inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
-	int	offset, ret = 0, recentcy = RECENTCY_MIN;
+	int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
+	int offset, ret = 0;
+	int recentcy = RECENTCY_MIN;
+	u32 dtime, now;
 
 	gdp = ext4_get_group_desc(sb, group, NULL);
 	if (unlikely(!gdp))
@@ -718,12 +719,18 @@ static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)
 
 	offset = (ino % inodes_per_block) * EXT4_INODE_SIZE(sb);
 	raw_inode = (struct ext4_inode *) (bh->b_data + offset);
+
+	/* i_dtime is only 32 bits on disk, but we only care about relative
+	 * times in the range of a few minutes (i.e. long enough to sync a
+	 * recently-deleted inode to disk), so using the low 32 bits of the
+	 * clock (a 68 year range) is enough, see time_before32() */
 	dtime = le32_to_cpu(raw_inode->i_dtime);
-	now = get_seconds();
+	now = ktime_get_real_seconds();
 	if (buffer_dirty(bh))
 		recentcy += RECENTCY_DIRTY;
 
-	if (dtime && (dtime < now) && (now < dtime + recentcy))
+	if (dtime && time_before32(dtime, now) &&
+	    time_before32(now, dtime + recentcy))
 		ret = 1;
 out:
 	brelse(bh);
-- 
cgit v1.2.3