summary | refs | log | tree | commit | diff | stats
path: root/fs/ext4
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2014-12-12 09:28:03 -0800
committer Linus Torvalds <torvalds@linux-foundation.org> 2014-12-12 09:28:03 -0800
commit 9bfccec24e31f4f83445cfe0c1b0a5ef97900628 (patch)
tree cea50a0797abbd27a5a4a47853d1e09b97cd8c83 /fs/ext4
parent 2756d373a3f45a3a9ebf4ac389f9e0e02bd35a93 (diff)
parent 50db71abc529c48b21f4c3034d3cff27cfb25795 (diff)
Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
Pull ext4 updates from Ted Ts'o:
 "Lots of bugs fixes, including Zheng and Jan's extent status shrinker
  fixes, which should improve CPU utilization and potential soft lockups
  under heavy memory pressure, and Eric Whitney's bigalloc fixes"

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (26 commits)
  ext4: ext4_da_convert_inline_data_to_extent drop locked page after error
  ext4: fix suboptimal seek_{data,hole} extents traversial
  ext4: ext4_inline_data_fiemap should respect callers argument
  ext4: prevent fsreentrance deadlock for inline_data
  ext4: forbid journal_async_commit in data=ordered mode
  jbd2: remove unnecessary NULL check before iput()
  ext4: Remove an unnecessary check for NULL before iput()
  ext4: remove unneeded code in ext4_unlink
  ext4: don't count external journal blocks as overhead
  ext4: remove never taken branch from ext4_ext_shift_path_extents()
  ext4: create nojournal_checksum mount option
  ext4: update comments regarding ext4_delete_inode()
  ext4: cleanup GFP flags inside resize path
  ext4: introduce aging to extent status tree
  ext4: cleanup flag definitions for extent status tree
  ext4: limit number of scanned extents in status tree shrinker
  ext4: move handling of list of shrinkable inodes into extent status code
  ext4: change LRU to round-robin in extent status tree shrinker
  ext4: cache extent hole in extent status tree for ext4_da_map_blocks()
  ext4: fix block reservation for bigalloc filesystems
  ...
Diffstat (limited to 'fs/ext4')
-rw-r--r-- fs/ext4/ext4.h 41
-rw-r--r-- fs/ext4/extents.c 223
-rw-r--r-- fs/ext4/extents_status.c 321
-rw-r--r-- fs/ext4/extents_status.h 82
-rw-r--r-- fs/ext4/file.c 220
-rw-r--r-- fs/ext4/inline.c 35
-rw-r--r-- fs/ext4/inode.c 37
-rw-r--r-- fs/ext4/ioctl.c 2
-rw-r--r-- fs/ext4/mballoc.c 15
-rw-r--r-- fs/ext4/migrate.c 2
-rw-r--r-- fs/ext4/move_extent.c 8
-rw-r--r-- fs/ext4/namei.c 1
-rw-r--r-- fs/ext4/resize.c 6
-rw-r--r-- fs/ext4/super.c 51
14 files changed, 526 insertions, 518 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index db3f772e57ae..a75fba67bb1f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -158,17 +158,8 @@ struct ext4_allocation_request {
#define EXT4_MAP_MAPPED (1 << BH_Mapped)
#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten)
#define EXT4_MAP_BOUNDARY (1 << BH_Boundary)
-/* Sometimes (in the bigalloc case, from ext4_da_get_block_prep) the caller of
- * ext4_map_blocks wants to know whether or not the underlying cluster has
- * already been accounted for. EXT4_MAP_FROM_CLUSTER conveys to the caller that
- * the requested mapping was from previously mapped (or delayed allocated)
- * cluster. We use BH_AllocFromCluster only for this flag. BH_AllocFromCluster
- * should never appear on buffer_head's state flags.
- */
-#define EXT4_MAP_FROM_CLUSTER (1 << BH_AllocFromCluster)
#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
- EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
- EXT4_MAP_FROM_CLUSTER)
+ EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY)
struct ext4_map_blocks {
ext4_fsblk_t m_pblk;
@@ -565,10 +556,8 @@ enum {
#define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080
/* Do not take i_data_sem locking in ext4_map_blocks */
#define EXT4_GET_BLOCKS_NO_LOCK 0x0100
- /* Do not put hole in extent cache */
-#define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200
/* Convert written extents to unwritten */
-#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0400
+#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0200
/*
* The bit position of these flags must not overlap with any of the
@@ -889,10 +878,12 @@ struct ext4_inode_info {
/* extents status tree */
struct ext4_es_tree i_es_tree;
rwlock_t i_es_lock;
- struct list_head i_es_lru;
+ struct list_head i_es_list;
unsigned int i_es_all_nr; /* protected by i_es_lock */
- unsigned int i_es_lru_nr; /* protected by i_es_lock */
- unsigned long i_touch_when; /* jiffies of last accessing */
+ unsigned int i_es_shk_nr; /* protected by i_es_lock */
+ ext4_lblk_t i_es_shrink_lblk; /* Offset where we start searching for
+ extents to shrink. Protected by
+ i_es_lock */
/* ialloc */
ext4_group_t i_last_alloc_group;
@@ -1337,10 +1328,11 @@ struct ext4_sb_info {
/* Reclaim extents from extent status tree */
struct shrinker s_es_shrinker;
- struct list_head s_es_lru;
+ struct list_head s_es_list; /* List of inodes with reclaimable extents */
+ long s_es_nr_inode;
struct ext4_es_stats s_es_stats;
struct mb_cache *s_mb_cache;
- spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
+ spinlock_t s_es_lock ____cacheline_aligned_in_smp;
/* Ratelimit ext4 messages. */
struct ratelimit_state s_err_ratelimit_state;
@@ -2196,7 +2188,6 @@ extern int ext4_calculate_overhead(struct super_block *sb);
extern void ext4_superblock_csum_set(struct super_block *sb);
extern void *ext4_kvmalloc(size_t size, gfp_t flags);
extern void *ext4_kvzalloc(size_t size, gfp_t flags);
-extern void ext4_kvfree(void *ptr);
extern int ext4_alloc_flex_bg_array(struct super_block *sb,
ext4_group_t ngroup);
extern const char *ext4_decode_error(struct super_block *sb, int errno,
@@ -2647,7 +2638,7 @@ extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
int *retval);
extern int ext4_inline_data_fiemap(struct inode *inode,
struct fiemap_extent_info *fieinfo,
- int *has_inline);
+ int *has_inline, __u64 start, __u64 len);
extern int ext4_try_to_evict_inline_data(handle_t *handle,
struct inode *inode,
int needed);
@@ -2795,16 +2786,6 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io,
extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
/*
- * Note that these flags will never ever appear in a buffer_head's state flag.
- * See EXT4_MAP_... to see where this is used.
- */
-enum ext4_state_bits {
- BH_AllocFromCluster /* allocated blocks were part of already
- * allocated cluster. */
- = BH_JBDPrivateStart
-};
-
-/*
* Add new method to test whether block and inode bitmaps are properly
* initialized. With uninit_bg reading the block from disk is not enough
* to mark the bitmap uptodate. We need to also zero-out the bitmap
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 0b16fb4c06d3..e5d3eadf47b1 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2306,16 +2306,16 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
ext4_lblk_t block)
{
int depth = ext_depth(inode);
- unsigned long len = 0;
- ext4_lblk_t lblock = 0;
+ ext4_lblk_t len;
+ ext4_lblk_t lblock;
struct ext4_extent *ex;
+ struct extent_status es;
ex = path[depth].p_ext;
if (ex == NULL) {
- /*
- * there is no extent yet, so gap is [0;-] and we
- * don't cache it
- */
+ /* there is no extent yet, so gap is [0;-] */
+ lblock = 0;
+ len = EXT_MAX_BLOCKS;
ext_debug("cache gap(whole file):");
} else if (block < le32_to_cpu(ex->ee_block)) {
lblock = block;
@@ -2324,9 +2324,6 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
block,
le32_to_cpu(ex->ee_block),
ext4_ext_get_actual_len(ex));
- if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1))
- ext4_es_insert_extent(inode, lblock, len, ~0,
- EXTENT_STATUS_HOLE);
} else if (block >= le32_to_cpu(ex->ee_block)
+ ext4_ext_get_actual_len(ex)) {
ext4_lblk_t next;
@@ -2340,14 +2337,19 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
block);
BUG_ON(next == lblock);
len = next - lblock;
- if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1))
- ext4_es_insert_extent(inode, lblock, len, ~0,
- EXTENT_STATUS_HOLE);
} else {
BUG();
}
- ext_debug(" -> %u:%lu\n", lblock, len);
+ ext4_es_find_delayed_extent_range(inode, lblock, lblock + len - 1, &es);
+ if (es.es_len) {
+ /* There's delayed extent containing lblock? */
+ if (es.es_lblk <= lblock)
+ return;
+ len = min(es.es_lblk - lblock, len);
+ }
+ ext_debug(" -> %u:%u\n", lblock, len);
+ ext4_es_insert_extent(inode, lblock, len, ~0, EXTENT_STATUS_HOLE);
}
/*
@@ -2481,7 +2483,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
ext4_lblk_t from, ext4_lblk_t to)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- unsigned short ee_len = ext4_ext_get_actual_len(ex);
+ unsigned short ee_len = ext4_ext_get_actual_len(ex);
ext4_fsblk_t pblk;
int flags = get_default_free_blocks_flags(inode);
@@ -2490,7 +2492,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
* at the beginning of the extent. Instead, we make a note
* that we tried freeing the cluster, and check to see if we
* need to free it on a subsequent call to ext4_remove_blocks,
- * or at the end of the ext4_truncate() operation.
+ * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space.
*/
flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
@@ -2501,8 +2503,8 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
* partial cluster here.
*/
pblk = ext4_ext_pblock(ex) + ee_len - 1;
- if ((*partial_cluster > 0) &&
- (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
+ if (*partial_cluster > 0 &&
+ *partial_cluster != (long long) EXT4_B2C(sbi, pblk)) {
ext4_free_blocks(handle, inode, NULL,
EXT4_C2B(sbi, *partial_cluster),
sbi->s_cluster_ratio, flags);
@@ -2528,7 +2530,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
&& to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
/* tail removal */
ext4_lblk_t num;
- unsigned int unaligned;
+ long long first_cluster;
num = le32_to_cpu(ex->ee_block) + ee_len - from;
pblk = ext4_ext_pblock(ex) + ee_len - num;
@@ -2538,7 +2540,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
* used by any other extent (partial_cluster is negative).
*/
if (*partial_cluster < 0 &&
- -(*partial_cluster) == EXT4_B2C(sbi, pblk + num - 1))
+ *partial_cluster == -(long long) EXT4_B2C(sbi, pblk+num-1))
flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
ext_debug("free last %u blocks starting %llu partial %lld\n",
@@ -2549,21 +2551,24 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
* beginning of a cluster, and we removed the entire
* extent and the cluster is not used by any other extent,
* save the partial cluster here, since we might need to
- * delete if we determine that the truncate operation has
- * removed all of the blocks in the cluster.
+ * delete if we determine that the truncate or punch hole
+ * operation has removed all of the blocks in the cluster.
+ * If that cluster is used by another extent, preserve its
+ * negative value so it isn't freed later on.
*
- * On the other hand, if we did not manage to free the whole
- * extent, we have to mark the cluster as used (store negative
- * cluster number in partial_cluster).
+ * If the whole extent wasn't freed, we've reached the
+ * start of the truncated/punched region and have finished
+ * removing blocks. If there's a partial cluster here it's
+ * shared with the remainder of the extent and is no longer
+ * a candidate for removal.
*/
- unaligned = EXT4_PBLK_COFF(sbi, pblk);
- if (unaligned && (ee_len == num) &&
- (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk))))
- *partial_cluster = EXT4_B2C(sbi, pblk);
- else if (unaligned)
- *partial_cluster = -((long long)EXT4_B2C(sbi, pblk));
- else if (*partial_cluster > 0)
+ if (EXT4_PBLK_COFF(sbi, pblk) && ee_len == num) {
+ first_cluster = (long long) EXT4_B2C(sbi, pblk);
+ if (first_cluster != -*partial_cluster)
+ *partial_cluster = first_cluster;
+ } else {
*partial_cluster = 0;
+ }
} else
ext4_error(sbi->s_sb, "strange request: removal(2) "
"%u-%u from %u:%u\n",
@@ -2574,15 +2579,16 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
/*
* ext4_ext_rm_leaf() Removes the extents associated with the
- * blocks appearing between "start" and "end", and splits the extents
- * if "start" and "end" appear in the same extent
+ * blocks appearing between "start" and "end". Both "start"
+ * and "end" must appear in the same extent or EIO is returned.
*
* @handle: The journal handle
* @inode: The files inode
* @path: The path to the leaf
* @partial_cluster: The cluster which we'll have to free if all extents
- * has been released from it. It gets negative in case
- * that the cluster is still used.
+ * has been released from it. However, if this value is
+ * negative, it's a cluster just to the right of the
+ * punched region and it must not be freed.
* @start: The first block to remove
* @end: The last block to remove
*/
@@ -2621,27 +2627,6 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
ex_ee_block = le32_to_cpu(ex->ee_block);
ex_ee_len = ext4_ext_get_actual_len(ex);
- /*
- * If we're starting with an extent other than the last one in the
- * node, we need to see if it shares a cluster with the extent to
- * the right (towards the end of the file). If its leftmost cluster
- * is this extent's rightmost cluster and it is not cluster aligned,
- * we'll mark it as a partial that is not to be deallocated.
- */
-
- if (ex != EXT_LAST_EXTENT(eh)) {
- ext4_fsblk_t current_pblk, right_pblk;
- long long current_cluster, right_cluster;
-
- current_pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
- current_cluster = (long long)EXT4_B2C(sbi, current_pblk);
- right_pblk = ext4_ext_pblock(ex + 1);
- right_cluster = (long long)EXT4_B2C(sbi, right_pblk);
- if (current_cluster == right_cluster &&
- EXT4_PBLK_COFF(sbi, right_pblk))
- *partial_cluster = -right_cluster;
- }
-
trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
while (ex >= EXT_FIRST_EXTENT(eh) &&
@@ -2666,14 +2651,16 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
if (end < ex_ee_block) {
/*
* We're going to skip this extent and move to another,
- * so if this extent is not cluster aligned we have
- * to mark the current cluster as used to avoid
- * accidentally freeing it later on
+ * so note that its first cluster is in use to avoid
+ * freeing it when removing blocks. Eventually, the
+ * right edge of the truncated/punched region will
+ * be just to the left.
*/
- pblk = ext4_ext_pblock(ex);
- if (EXT4_PBLK_COFF(sbi, pblk))
+ if (sbi->s_cluster_ratio > 1) {
+ pblk = ext4_ext_pblock(ex);
*partial_cluster =
- -((long long)EXT4_B2C(sbi, pblk));
+ -(long long) EXT4_B2C(sbi, pblk);
+ }
ex--;
ex_ee_block = le32_to_cpu(ex->ee_block);
ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2749,8 +2736,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
sizeof(struct ext4_extent));
}
le16_add_cpu(&eh->eh_entries, -1);
- } else if (*partial_cluster > 0)
- *partial_cluster = 0;
+ }
err = ext4_ext_dirty(handle, inode, path + depth);
if (err)
@@ -2769,20 +2755,18 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
/*
* If there's a partial cluster and at least one extent remains in
* the leaf, free the partial cluster if it isn't shared with the
- * current extent. If there's a partial cluster and no extents
- * remain in the leaf, it can't be freed here. It can only be
- * freed when it's possible to determine if it's not shared with
- * any other extent - when the next leaf is processed or when space
- * removal is complete.
+ * current extent. If it is shared with the current extent
+ * we zero partial_cluster because we've reached the start of the
+ * truncated/punched region and we're done removing blocks.
*/
- if (*partial_cluster > 0 && eh->eh_entries &&
- (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
- *partial_cluster)) {
- int flags = get_default_free_blocks_flags(inode);
-
- ext4_free_blocks(handle, inode, NULL,
- EXT4_C2B(sbi, *partial_cluster),
- sbi->s_cluster_ratio, flags);
+ if (*partial_cluster > 0 && ex >= EXT_FIRST_EXTENT(eh)) {
+ pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
+ if (*partial_cluster != (long long) EXT4_B2C(sbi, pblk)) {
+ ext4_free_blocks(handle, inode, NULL,
+ EXT4_C2B(sbi, *partial_cluster),
+ sbi->s_cluster_ratio,
+ get_default_free_blocks_flags(inode));
+ }
*partial_cluster = 0;
}
@@ -2819,7 +2803,7 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
ext4_lblk_t end)
{
- struct super_block *sb = inode->i_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
int depth = ext_depth(inode);
struct ext4_ext_path *path = NULL;
long long partial_cluster = 0;
@@ -2845,9 +2829,10 @@ again:
*/
if (end < EXT_MAX_BLOCKS - 1) {
struct ext4_extent *ex;
- ext4_lblk_t ee_block;
+ ext4_lblk_t ee_block, ex_end, lblk;
+ ext4_fsblk_t pblk;
- /* find extent for this block */
+ /* find extent for or closest extent to this block */
path = ext4_find_extent(inode, end, NULL, EXT4_EX_NOCACHE);
if (IS_ERR(path)) {
ext4_journal_stop(handle);
@@ -2867,6 +2852,7 @@ again:
}
ee_block = le32_to_cpu(ex->ee_block);
+ ex_end = ee_block + ext4_ext_get_actual_len(ex) - 1;
/*
* See if the last block is inside the extent, if so split
@@ -2874,8 +2860,19 @@ again:
* tail of the first part of the split extent in
* ext4_ext_rm_leaf().
*/
- if (end >= ee_block &&
- end < ee_block + ext4_ext_get_actual_len(ex) - 1) {
+ if (end >= ee_block && end < ex_end) {
+
+ /*
+ * If we're going to split the extent, note that
+ * the cluster containing the block after 'end' is
+ * in use to avoid freeing it when removing blocks.
+ */
+ if (sbi->s_cluster_ratio > 1) {
+ pblk = ext4_ext_pblock(ex) + end - ee_block + 2;
+ partial_cluster =
+ -(long long) EXT4_B2C(sbi, pblk);
+ }
+
/*
* Split the extent in two so that 'end' is the last
* block in the first new extent. Also we should not
@@ -2886,6 +2883,24 @@ again:
end + 1, 1);
if (err < 0)
goto out;
+
+ } else if (sbi->s_cluster_ratio > 1 && end >= ex_end) {
+ /*
+ * If there's an extent to the right its first cluster
+ * contains the immediate right boundary of the
+ * truncated/punched region. Set partial_cluster to
+ * its negative value so it won't be freed if shared
+ * with the current extent. The end < ee_block case
+ * is handled in ext4_ext_rm_leaf().
+ */
+ lblk = ex_end + 1;
+ err = ext4_ext_search_right(inode, path, &lblk, &pblk,
+ &ex);
+ if (err)
+ goto out;
+ if (pblk)
+ partial_cluster =
+ -(long long) EXT4_B2C(sbi, pblk);
}
}
/*
@@ -2996,16 +3011,18 @@ again:
trace_ext4_ext_remove_space_done(inode, start, end, depth,
partial_cluster, path->p_hdr->eh_entries);
- /* If we still have something in the partial cluster and we have removed
+ /*
+ * If we still have something in the partial cluster and we have removed
* even the first extent, then we should free the blocks in the partial
- * cluster as well. */
- if (partial_cluster > 0 && path->p_hdr->eh_entries == 0) {
- int flags = get_default_free_blocks_flags(inode);
-
+ * cluster as well. (This code will only run when there are no leaves
+ * to the immediate left of the truncated/punched region.)
+ */
+ if (partial_cluster > 0 && err == 0) {
+ /* don't zero partial_cluster since it's not used afterwards */
ext4_free_blocks(handle, inode, NULL,
- EXT4_C2B(EXT4_SB(sb), partial_cluster),
- EXT4_SB(sb)->s_cluster_ratio, flags);
- partial_cluster = 0;
+ EXT4_C2B(sbi, partial_cluster),
+ sbi->s_cluster_ratio,
+ get_default_free_blocks_flags(inode));
}
/* TODO: flexible tree reduction should be here */
@@ -4267,6 +4284,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
ext4_io_end_t *io = ext4_inode_aio(inode);
ext4_lblk_t cluster_offset;
int set_unwritten = 0;
+ bool map_from_cluster = false;
ext_debug("blocks %u/%u requested for inode %lu\n",
map->m_lblk, map->m_len, inode->i_ino);
@@ -4343,10 +4361,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
}
}
- if ((sbi->s_cluster_ratio > 1) &&
- ext4_find_delalloc_cluster(inode, map->m_lblk))
- map->m_flags |= EXT4_MAP_FROM_CLUSTER;
-
/*
* requested block isn't allocated yet;
* we couldn't try to create block if create flag is zero
@@ -4356,15 +4370,13 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
* put just found gap into cache to speed up
* subsequent requests
*/
- if ((flags & EXT4_GET_BLOCKS_NO_PUT_HOLE) == 0)
- ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
+ ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
goto out2;
}
/*
* Okay, we need to do block allocation.
*/
- map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
newex.ee_block = cpu_to_le32(map->m_lblk);
cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
@@ -4376,7 +4388,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
ar.len = allocated = map->m_len;
newblock = map->m_pblk;
- map->m_flags |= EXT4_MAP_FROM_CLUSTER;
+ map_from_cluster = true;
goto got_allocated_blocks;
}
@@ -4397,7 +4409,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
ar.len = allocated = map->m_len;
newblock = map->m_pblk;
- map->m_flags |= EXT4_MAP_FROM_CLUSTER;
+ map_from_cluster = true;
goto got_allocated_blocks;
}
@@ -4523,7 +4535,7 @@ got_allocated_blocks:
*/
reserved_clusters = get_reserved_cluster_alloc(inode,
map->m_lblk, allocated);
- if (map->m_flags & EXT4_MAP_FROM_CLUSTER) {
+ if (map_from_cluster) {
if (reserved_clusters) {
/*
* We have clusters reserved for this range.
@@ -4620,7 +4632,6 @@ out2:
trace_ext4_ext_map_blocks_exit(inode, flags, map,
err ? err : allocated);
- ext4_es_lru_add(inode);
return err ? err : allocated;
}
@@ -5140,7 +5151,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
if (ext4_has_inline_data(inode)) {
int has_inline = 1;
- error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline);
+ error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline,
+ start, len);
if (has_inline)
return error;
@@ -5154,8 +5166,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
/* fallback to generic here if not in extents fmt */
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
- return generic_block_fiemap(inode, fieinfo, start, len,
- ext4_get_block);
+ return __generic_block_fiemap(inode, fieinfo, start, len,
+ ext4_get_block);
if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
return -EBADR;
@@ -5179,7 +5191,6 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
error = ext4_fill_fiemap_extents(inode, start_blk,
len_blks, fieinfo);
}
- ext4_es_lru_add(inode);
return error;
}
@@ -5239,8 +5250,6 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
return -EIO;
ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
- if (!ex_last)
- return -EIO;
err = ext4_access_path(handle, inode, path + depth);
if (err)
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 94e7855ae71b..e04d45733976 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -147,10 +147,9 @@ static struct kmem_cache *ext4_es_cachep;
static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t end);
-static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
- int nr_to_scan);
-static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
- struct ext4_inode_info *locked_ei);
+static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
+static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+ struct ext4_inode_info *locked_ei);
int __init ext4_init_es(void)
{
@@ -298,6 +297,36 @@ out:
trace_ext4_es_find_delayed_extent_range_exit(inode, es);
}
+static void ext4_es_list_add(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ if (!list_empty(&ei->i_es_list))
+ return;
+
+ spin_lock(&sbi->s_es_lock);
+ if (list_empty(&ei->i_es_list)) {
+ list_add_tail(&ei->i_es_list, &sbi->s_es_list);
+ sbi->s_es_nr_inode++;
+ }
+ spin_unlock(&sbi->s_es_lock);
+}
+
+static void ext4_es_list_del(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ spin_lock(&sbi->s_es_lock);
+ if (!list_empty(&ei->i_es_list)) {
+ list_del_init(&ei->i_es_list);
+ sbi->s_es_nr_inode--;
+ WARN_ON_ONCE(sbi->s_es_nr_inode < 0);
+ }
+ spin_unlock(&sbi->s_es_lock);
+}
+
static struct extent_status *
ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
ext4_fsblk_t pblk)
@@ -314,9 +343,10 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
* We don't count delayed extent because we never try to reclaim them
*/
if (!ext4_es_is_delayed(es)) {
- EXT4_I(inode)->i_es_lru_nr++;
+ if (!EXT4_I(inode)->i_es_shk_nr++)
+ ext4_es_list_add(inode);
percpu_counter_inc(&EXT4_SB(inode->i_sb)->
- s_es_stats.es_stats_lru_cnt);
+ s_es_stats.es_stats_shk_cnt);
}
EXT4_I(inode)->i_es_all_nr++;
@@ -330,12 +360,13 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
EXT4_I(inode)->i_es_all_nr--;
percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
- /* Decrease the lru counter when this es is not delayed */
+ /* Decrease the shrink counter when this es is not delayed */
if (!ext4_es_is_delayed(es)) {
- BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0);
- EXT4_I(inode)->i_es_lru_nr--;
+ BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0);
+ if (!--EXT4_I(inode)->i_es_shk_nr)
+ ext4_es_list_del(inode);
percpu_counter_dec(&EXT4_SB(inode->i_sb)->
- s_es_stats.es_stats_lru_cnt);
+ s_es_stats.es_stats_shk_cnt);
}
kmem_cache_free(ext4_es_cachep, es);
@@ -351,7 +382,7 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
static int ext4_es_can_be_merged(struct extent_status *es1,
struct extent_status *es2)
{
- if (ext4_es_status(es1) != ext4_es_status(es2))
+ if (ext4_es_type(es1) != ext4_es_type(es2))
return 0;
if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) {
@@ -394,6 +425,8 @@ ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es)
es1 = rb_entry(node, struct extent_status, rb_node);
if (ext4_es_can_be_merged(es1, es)) {
es1->es_len += es->es_len;
+ if (ext4_es_is_referenced(es))
+ ext4_es_set_referenced(es1);
rb_erase(&es->rb_node, &tree->root);
ext4_es_free_extent(inode, es);
es = es1;
@@ -416,6 +449,8 @@ ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es)
es1 = rb_entry(node, struct extent_status, rb_node);
if (ext4_es_can_be_merged(es, es1)) {
es->es_len += es1->es_len;
+ if (ext4_es_is_referenced(es1))
+ ext4_es_set_referenced(es);
rb_erase(node, &tree->root);
ext4_es_free_extent(inode, es1);
}
@@ -683,8 +718,8 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
goto error;
retry:
err = __es_insert_extent(inode, &newes);
- if (err == -ENOMEM && __ext4_es_shrink(EXT4_SB(inode->i_sb), 1,
- EXT4_I(inode)))
+ if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
+ 128, EXT4_I(inode)))
goto retry;
if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
err = 0;
@@ -782,6 +817,8 @@ out:
es->es_lblk = es1->es_lblk;
es->es_len = es1->es_len;
es->es_pblk = es1->es_pblk;
+ if (!ext4_es_is_referenced(es))
+ ext4_es_set_referenced(es);
stats->es_stats_cache_hits++;
} else {
stats->es_stats_cache_misses++;
@@ -841,8 +878,8 @@ retry:
es->es_lblk = orig_es.es_lblk;
es->es_len = orig_es.es_len;
if ((err == -ENOMEM) &&
- __ext4_es_shrink(EXT4_SB(inode->i_sb), 1,
- EXT4_I(inode)))
+ __es_shrink(EXT4_SB(inode->i_sb),
+ 128, EXT4_I(inode)))
goto retry;
goto out;
}
@@ -914,6 +951,11 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
end = lblk + len - 1;
BUG_ON(end < lblk);
+ /*
+ * ext4_clear_inode() depends on us taking i_es_lock unconditionally
+ * so that we are sure __es_shrink() is done with the inode before it
+ * is reclaimed.
+ */
write_lock(&EXT4_I(inode)->i_es_lock);
err = __es_remove_extent(inode, lblk, end);
write_unlock(&EXT4_I(inode)->i_es_lock);
@@ -921,114 +963,75 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
return err;
}
-static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
- struct list_head *b)
-{
- struct ext4_inode_info *eia, *eib;
- eia = list_entry(a, struct ext4_inode_info, i_es_lru);
- eib = list_entry(b, struct ext4_inode_info, i_es_lru);
-
- if (ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
- !ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
- return 1;
- if (!ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
- ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
- return -1;
- if (eia->i_touch_when == eib->i_touch_when)
- return 0;
- if (time_after(eia->i_touch_when, eib->i_touch_when))
- return 1;
- else
- return -1;
-}
-
-static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
- struct ext4_inode_info *locked_ei)
+static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+ struct ext4_inode_info *locked_ei)
{
struct ext4_inode_info *ei;
struct ext4_es_stats *es_stats;
- struct list_head *cur, *tmp;
- LIST_HEAD(skipped);
ktime_t start_time;
u64 scan_time;
+ int nr_to_walk;
int nr_shrunk = 0;
- int retried = 0, skip_precached = 1, nr_skipped = 0;
+ int retried = 0, nr_skipped = 0;
es_stats = &sbi->s_es_stats;
start_time = ktime_get();
- spin_lock(&sbi->s_es_lru_lock);
retry:
- list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
- int shrunk;
-
- /*
- * If we have already reclaimed all extents from extent
- * status tree, just stop the loop immediately.
- */
- if (percpu_counter_read_positive(
- &es_stats->es_stats_lru_cnt) == 0)
- break;
-
- ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
+ spin_lock(&sbi->s_es_lock);
+ nr_to_walk = sbi->s_es_nr_inode;
+ while (nr_to_walk-- > 0) {
+ if (list_empty(&sbi->s_es_list)) {
+ spin_unlock(&sbi->s_es_lock);
+ goto out;
+ }
+ ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info,
+ i_es_list);
+ /* Move the inode to the tail */
+ list_move_tail(&ei->i_es_list, &sbi->s_es_list);
/*
- * Skip the inode that is newer than the last_sorted
- * time. Normally we try hard to avoid shrinking
- * precached inodes, but we will as a last resort.
+ * Normally we try hard to avoid shrinking precached inodes,
+ * but we will as a last resort.
*/
- if ((es_stats->es_stats_last_sorted < ei->i_touch_when) ||
- (skip_precached && ext4_test_inode_state(&ei->vfs_inode,
- EXT4_STATE_EXT_PRECACHED))) {
+ if (!retried && ext4_test_inode_state(&ei->vfs_inode,
+ EXT4_STATE_EXT_PRECACHED)) {
nr_skipped++;
- list_move_tail(cur, &skipped);
continue;
}
- if (ei->i_es_lru_nr == 0 || ei == locked_ei ||
- !write_trylock(&ei->i_es_lock))
+ if (ei == locked_ei || !write_trylock(&ei->i_es_lock)) {
+ nr_skipped++;
continue;
+ }
+ /*
+ * Now we hold i_es_lock which protects us from inode reclaim
+ * freeing inode under us
+ */
+ spin_unlock(&sbi->s_es_lock);
- shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan);
- if (ei->i_es_lru_nr == 0)
- list_del_init(&ei->i_es_lru);
+ nr_shrunk += es_reclaim_extents(ei, &nr_to_scan);
write_unlock(&ei->i_es_lock);
- nr_shrunk += shrunk;
- nr_to_scan -= shrunk;
- if (nr_to_scan == 0)
- break;
+ if (nr_to_scan <= 0)
+ goto out;
+ spin_lock(&sbi->s_es_lock);
}
-
- /* Move the newer inodes into the tail of the LRU list. */
- list_splice_tail(&skipped, &sbi->s_es_lru);
- INIT_LIST_HEAD(&skipped);
+ spin_unlock(&sbi->s_es_lock);
/*
* If we skipped any inodes, and we weren't able to make any
- * forward progress, sort the list and try again.
+ * forward progress, try again to scan precached inodes.
*/
if ((nr_shrunk == 0) && nr_skipped && !retried) {
retried++;
- list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
- es_stats->es_stats_last_sorted = jiffies;
- ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info,
- i_es_lru);
- /*
- * If there are no non-precached inodes left on the
- * list, start releasing precached extents.
- */
- if (ext4_test_inode_state(&ei->vfs_inode,
- EXT4_STATE_EXT_PRECACHED))
- skip_precached = 0;
goto retry;
}
- spin_unlock(&sbi->s_es_lru_lock);
-
if (locked_ei && nr_shrunk == 0)
- nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan);
+ nr_shrunk = es_reclaim_extents(locked_ei, &nr_to_scan);
+out