summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2019-11-30 10:53:02 -0800
committer Linus Torvalds <torvalds@linux-foundation.org> 2019-11-30 10:53:02 -0800
commit 50b8b3f85a01543fb82d3bb9bfe7d06659522c70 (patch)
tree 178bcf210025fec174a9a6fa9a094283f3fcc3ad
parent f112a2fd1f5999c6029551f901952392d900cf99 (diff)
parent dfdeeb41fb08fbe11d3cfefba9c0fcd00c95a36d (diff)
Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
Pull ext4 updates from Ted Ts'o: "This merge window saw the following new features added to ext4: - Direct I/O via iomap (required the iomap-for-next branch from Darrick as a prereq). - Support for using dioread-nolock where the block size < page size. - Support for encryption for file systems where the block size < page size. - Rework of journal credits handling so a revoke-heavy workload will not cause the journal to run out of space. - Replace bit-spinlocks with spinlocks in jbd2. Also included were some bug fixes and cleanups, mostly to clean up corner cases from fuzzed file systems and error path handling" * tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (59 commits) ext4: work around deleting a file with i_nlink == 0 safely ext4: add more paranoia checking in ext4_expand_extra_isize handling jbd2: make jbd2_handle_buffer_credits() handle reserved handles ext4: fix a bug in ext4_wait_for_tail_page_commit ext4: bio_alloc with __GFP_DIRECT_RECLAIM never fails ext4: code cleanup for get_next_id ext4: fix leak of quota reservations ext4: remove unused variable warning in parse_options() ext4: Enable encryption for subpage-sized blocks fs/buffer.c: support fscrypt in block_read_full_page() ext4: Add error handling for io_end_vec struct allocation jbd2: Fine tune estimate of necessary descriptor blocks jbd2: Provide trace event for handle restarts ext4: Reserve revoke credits for freed blocks jbd2: Make credit checking more strict jbd2: Rename h_buffer_credits to h_total_credits jbd2: Reserve space for revoke descriptor blocks jbd2: Drop jbd2_space_needed() jbd2: Account descriptor blocks into t_outstanding_credits jbd2: Factor out common parts of stopping and restarting a handle ...
-rw-r--r-- Documentation/filesystems/fscrypt.rst 4
-rw-r--r-- fs/buffer.c 48
-rw-r--r-- fs/ext4/ext4.h 22
-rw-r--r-- fs/ext4/ext4_jbd2.c 32
-rw-r--r-- fs/ext4/ext4_jbd2.h 106
-rw-r--r-- fs/ext4/extents.c 149
-rw-r--r-- fs/ext4/file.c 412
-rw-r--r-- fs/ext4/fsync.c 72
-rw-r--r-- fs/ext4/ialloc.c 7
-rw-r--r-- fs/ext4/indirect.c 125
-rw-r--r-- fs/ext4/inode.c 926
-rw-r--r-- fs/ext4/migrate.c 103
-rw-r--r-- fs/ext4/namei.c 50
-rw-r--r-- fs/ext4/page-io.c 167
-rw-r--r-- fs/ext4/readpage.c 6
-rw-r--r-- fs/ext4/resize.c 46
-rw-r--r-- fs/ext4/super.c 57
-rw-r--r-- fs/ext4/xattr.c 94
-rw-r--r-- fs/jbd2/checkpoint.c 2
-rw-r--r-- fs/jbd2/commit.c 26
-rw-r--r-- fs/jbd2/journal.c 65
-rw-r--r-- fs/jbd2/revoke.c 6
-rw-r--r-- fs/jbd2/transaction.c 400
-rw-r--r-- fs/ocfs2/alloc.c 32
-rw-r--r-- fs/ocfs2/journal.c 8
-rw-r--r-- fs/ocfs2/suballoc.c 19
-rw-r--r-- include/linux/jbd2.h 118
-rw-r--r-- include/linux/journal-head.h 21
-rw-r--r-- include/trace/events/ext4.h 13
-rw-r--r-- include/trace/events/jbd2.h 16
30 files changed, 1691 insertions, 1461 deletions
diff --git a/Documentation/filesystems/fscrypt.rst b/Documentation/filesystems/fscrypt.rst
index 471a511c7508..68c2bc8275cf 100644
--- a/Documentation/filesystems/fscrypt.rst
+++ b/Documentation/filesystems/fscrypt.rst
@@ -342,8 +342,8 @@ Contents encryption
-------------------
For file contents, each filesystem block is encrypted independently.
-Currently, only the case where the filesystem block size is equal to
-the system's page size (usually 4096 bytes) is supported.
+Starting from Linux kernel 5.5, encryption of filesystems with block
+size less than system's page size is supported.
Each block's IV is set to the logical block number within the file as
a little endian number, except that:
diff --git a/fs/buffer.c b/fs/buffer.c
index 86a38b979323..d39838090b22 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -47,6 +47,7 @@
#include <linux/pagevec.h>
#include <linux/sched/mm.h>
#include <trace/events/block.h>
+#include <linux/fscrypt.h>
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
@@ -246,10 +247,6 @@ out:
return ret;
}
-/*
- * I/O completion handler for block_read_full_page() - pages
- * which come unlocked at the end of I/O.
- */
static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
{
unsigned long flags;
@@ -307,6 +304,47 @@ still_busy:
return;
}
+struct decrypt_bh_ctx {
+ struct work_struct work;
+ struct buffer_head *bh;
+};
+
+static void decrypt_bh(struct work_struct *work)
+{
+ struct decrypt_bh_ctx *ctx =
+ container_of(work, struct decrypt_bh_ctx, work);
+ struct buffer_head *bh = ctx->bh;
+ int err;
+
+ err = fscrypt_decrypt_pagecache_blocks(bh->b_page, bh->b_size,
+ bh_offset(bh));
+ end_buffer_async_read(bh, err == 0);
+ kfree(ctx);
+}
+
+/*
+ * I/O completion handler for block_read_full_page() - pages
+ * which come unlocked at the end of I/O.
+ */
+static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate)
+{
+ /* Decrypt if needed */
+ if (uptodate && IS_ENABLED(CONFIG_FS_ENCRYPTION) &&
+ IS_ENCRYPTED(bh->b_page->mapping->host) &&
+ S_ISREG(bh->b_page->mapping->host->i_mode)) {
+ struct decrypt_bh_ctx *ctx = kmalloc(sizeof(*ctx), GFP_ATOMIC);
+
+ if (ctx) {
+ INIT_WORK(&ctx->work, decrypt_bh);
+ ctx->bh = bh;
+ fscrypt_enqueue_decrypt_work(&ctx->work);
+ return;
+ }
+ uptodate = 0;
+ }
+ end_buffer_async_read(bh, uptodate);
+}
+
/*
* Completion handler for block_write_full_page() - pages which are unlocked
* during I/O, and which have PageWriteback cleared upon I/O completion.
@@ -379,7 +417,7 @@ EXPORT_SYMBOL(end_buffer_async_write);
*/
static void mark_buffer_async_read(struct buffer_head *bh)
{
- bh->b_end_io = end_buffer_async_read;
+ bh->b_end_io = end_buffer_async_read_io;
set_buffer_async_read(bh);
}
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b3a2cc7c0252..f8578caba40d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -198,6 +198,12 @@ struct ext4_system_blocks {
*/
#define EXT4_IO_END_UNWRITTEN 0x0001
+struct ext4_io_end_vec {
+ struct list_head list; /* list of io_end_vec */
+ loff_t offset; /* offset in the file */
+ ssize_t size; /* size of the extent */
+};
+
/*
* For converting unwritten extents on a work queue. 'handle' is used for
* buffered writeback.
@@ -211,8 +217,7 @@ typedef struct ext4_io_end {
* bios covering the extent */
unsigned int flag; /* unwritten or not */
atomic_t count; /* reference counter */
- loff_t offset; /* offset in the file */
- ssize_t size; /* size of the extent */
+ struct list_head list_vec; /* list of ext4_io_end_vec */
} ext4_io_end_t;
struct ext4_io_submit {
@@ -1579,7 +1584,6 @@ enum {
EXT4_STATE_NO_EXPAND, /* No space for expansion */
EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */
EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */
- EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
EXT4_STATE_NEWENTRY, /* File just added to dir */
EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */
EXT4_STATE_EXT_PRECACHED, /* extents have been precached */
@@ -2562,8 +2566,6 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
int ext4_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
-int ext4_dio_get_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create);
int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int create);
int ext4_walk_page_buffers(handle_t *handle,
@@ -2606,7 +2608,6 @@ extern int ext4_can_truncate(struct inode *inode);
extern int ext4_truncate(struct inode *);
extern int ext4_break_layouts(struct inode *);
extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
-extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
extern void ext4_set_inode_flags(struct inode *);
extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
@@ -3266,6 +3267,8 @@ extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
loff_t len);
extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
loff_t offset, ssize_t len);
+extern int ext4_convert_unwritten_io_end_vec(handle_t *handle,
+ ext4_io_end_t *io_end);
extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
extern int ext4_ext_calc_metadata_amount(struct inode *inode,
@@ -3298,6 +3301,10 @@ extern int ext4_swap_extents(handle_t *handle, struct inode *inode1,
ext4_lblk_t lblk2, ext4_lblk_t count,
int mark_unwritten,int *err);
extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu);
+extern int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
+ int check_cred, int restart_cred,
+ int revoke_cred);
+
/* move_extent.c */
extern void ext4_double_down_write_data_sem(struct inode *first,
@@ -3324,6 +3331,8 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io,
int len,
struct writeback_control *wbc,
bool keep_towrite);
+extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end);
+extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);
/* mmp.c */
extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
@@ -3381,6 +3390,7 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
}
extern const struct iomap_ops ext4_iomap_ops;
+extern const struct iomap_ops ext4_iomap_report_ops;
static inline int ext4_buffer_uptodate(struct buffer_head *bh)
{
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 7c70b08d104c..d3b8cdea5df7 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -65,12 +65,14 @@ static int ext4_journal_check_start(struct super_block *sb)
}
handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
- int type, int blocks, int rsv_blocks)
+ int type, int blocks, int rsv_blocks,
+ int revoke_creds)
{
journal_t *journal;
int err;
- trace_ext4_journal_start(sb, blocks, rsv_blocks, _RET_IP_);
+ trace_ext4_journal_start(sb, blocks, rsv_blocks, revoke_creds,
+ _RET_IP_);
err = ext4_journal_check_start(sb);
if (err < 0)
return ERR_PTR(err);
@@ -78,8 +80,8 @@ handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
journal = EXT4_SB(sb)->s_journal;
if (!journal)
return ext4_get_nojournal();
- return jbd2__journal_start(journal, blocks, rsv_blocks, GFP_NOFS,
- type, line);
+ return jbd2__journal_start(journal, blocks, rsv_blocks, revoke_creds,
+ GFP_NOFS, type, line);
}
int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
@@ -119,8 +121,8 @@ handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
return ext4_get_nojournal();
sb = handle->h_journal->j_private;
- trace_ext4_journal_start_reserved(sb, handle->h_buffer_credits,
- _RET_IP_);
+ trace_ext4_journal_start_reserved(sb,
+ jbd2_handle_buffer_credits(handle), _RET_IP_);
err = ext4_journal_check_start(sb);
if (err < 0) {
jbd2_journal_free_reserved(handle);
@@ -133,6 +135,19 @@ handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
return handle;
}
+int __ext4_journal_ensure_credits(handle_t *handle, int check_cred,
+ int extend_cred, int revoke_cred)
+{
+ if (!ext4_handle_valid(handle))
+ return 0;
+ if (jbd2_handle_buffer_credits(handle) >= check_cred &&
+ handle->h_revoke_credits >= revoke_cred)
+ return 0;
+ extend_cred = max(0, extend_cred - jbd2_handle_buffer_credits(handle));
+ revoke_cred = max(0, revoke_cred - handle->h_revoke_credits);
+ return ext4_journal_extend(handle, extend_cred, revoke_cred);
+}
+
static void ext4_journal_abort_handle(const char *caller, unsigned int line,
const char *err_fn,
struct buffer_head *bh,
@@ -278,7 +293,7 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
handle->h_type,
handle->h_line_no,
handle->h_requested_credits,
- handle->h_buffer_credits, err);
+ jbd2_handle_buffer_credits(handle), err);
return err;
}
ext4_error_inode(inode, where, line,
@@ -289,7 +304,8 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
handle->h_type,
handle->h_line_no,
handle->h_requested_credits,
- handle->h_buffer_credits, err);
+ jbd2_handle_buffer_credits(handle),
+ err);
}
} else {
if (inode)
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index ef8fcf7d0d3b..a6b9b66dbfad 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -261,7 +261,8 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line,
__ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
- int type, int blocks, int rsv_blocks);
+ int type, int blocks, int rsv_blocks,
+ int revoke_creds);
int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
@@ -288,28 +289,41 @@ static inline int ext4_handle_is_aborted(handle_t *handle)
return 0;
}
-static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
+static inline int ext4_free_metadata_revoke_credits(struct super_block *sb,
+ int blocks)
{
- if (ext4_handle_valid(handle) && handle->h_buffer_credits < needed)
- return 0;
- return 1;
+ /* Freeing each metadata block can result in freeing one cluster */
+ return blocks * EXT4_SB(sb)->s_cluster_ratio;
+}
+
+static inline int ext4_trans_default_revoke_credits(struct super_block *sb)
+{
+ return ext4_free_metadata_revoke_credits(sb, 8);
}
#define ext4_journal_start_sb(sb, type, nblocks) \
- __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0)
+ __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0, \
+ ext4_trans_default_revoke_credits(sb))
#define ext4_journal_start(inode, type, nblocks) \
- __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0)
+ __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0, \
+ ext4_trans_default_revoke_credits((inode)->i_sb))
+
+#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks)\
+ __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks),\
+ ext4_trans_default_revoke_credits((inode)->i_sb))
-#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks) \
- __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks))
+#define ext4_journal_start_with_revoke(inode, type, blocks, revoke_creds) \
+ __ext4_journal_start((inode), __LINE__, (type), (blocks), 0, \
+ (revoke_creds))
static inline handle_t *__ext4_journal_start(struct inode *inode,
unsigned int line, int type,
- int blocks, int rsv_blocks)
+ int blocks, int rsv_blocks,
+ int revoke_creds)
{
return __ext4_journal_start_sb(inode->i_sb, line, type, blocks,
- rsv_blocks);
+ rsv_blocks, revoke_creds);
}
#define ext4_journal_stop(handle) \
@@ -332,20 +346,68 @@ static inline handle_t *ext4_journal_current_handle(void)
return journal_current_handle();
}
-static inline int ext4_journal_extend(handle_t *handle, int nblocks)
+static inline int ext4_journal_extend(handle_t *handle, int nblocks, int revoke)
{
if (ext4_handle_valid(handle))
- return jbd2_journal_extend(handle, nblocks);
+ return jbd2_journal_extend(handle, nblocks, revoke);
return 0;
}
-static inline int ext4_journal_restart(handle_t *handle, int nblocks)
+static inline int ext4_journal_restart(handle_t *handle, int nblocks,
+ int revoke)
{
if (ext4_handle_valid(handle))
- return jbd2_journal_restart(handle, nblocks);
+ return jbd2__journal_restart(handle, nblocks, revoke, GFP_NOFS);
return 0;
}
+int __ext4_journal_ensure_credits(handle_t *handle, int check_cred,
+ int extend_cred, int revoke_cred);
+
+
+/*
+ * Ensure @handle has at least @check_cred credits available. If not,
+ * transaction will be extended or restarted to contain at least @extend_cred
+ * credits. Before restarting transaction @fn is executed to allow for cleanup
+ * before the transaction is restarted.
+ *
+ * The return value is < 0 in case of error, 0 in case the handle has enough
+ * credits or transaction extension succeeded, 1 in case transaction had to be
+ * restarted.
+ */
+#define ext4_journal_ensure_credits_fn(handle, check_cred, extend_cred, \
+ revoke_cred, fn) \
+({ \
+ __label__ __ensure_end; \
+ int err = __ext4_journal_ensure_credits((handle), (check_cred), \
+ (extend_cred), (revoke_cred)); \
+ \
+ if (err <= 0) \
+ goto __ensure_end; \
+ err = (fn); \
+ if (err < 0) \
+ goto __ensure_end; \
+ err = ext4_journal_restart((handle), (extend_cred), (revoke_cred)); \
+ if (err == 0) \
+ err = 1; \
+__ensure_end: \
+ err; \
+})
+
+/*
+ * Ensure given handle has at least requested amount of credits available,
+ * possibly restarting transaction if needed. We also make sure the transaction
+ * has space for at least ext4_trans_default_revoke_credits(sb) revoke records
+ * as freeing one or two blocks is very common pattern and requesting this is
+ * very cheap.
+ */
+static inline int ext4_journal_ensure_credits(handle_t *handle, int credits,
+ int revoke_creds)
+{
+ return ext4_journal_ensure_credits_fn(handle, credits, credits,
+ revoke_creds, 0);
+}
+
static inline int ext4_journal_blocks_per_page(struct inode *inode)
{
if (EXT4_JOURNAL(inode) != NULL)
@@ -407,6 +469,7 @@ static inline int ext4_inode_journal_mode(struct inode *inode)
return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */
/* We do not support data journalling with delayed allocation */
if (!S_ISREG(inode->i_mode) ||
+ ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
(ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
!test_opt(inode->i_sb, DELALLOC))) {
@@ -437,6 +500,19 @@ static inline int ext4_should_writeback_data(struct inode *inode)
return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE;
}
+static inline int ext4_free_data_revoke_credits(struct inode *inode, int blocks)
+{
+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+ return 0;
+ if (!ext4_should_journal_data(inode))
+ return 0;
+ /*
+ * Data blocks in one extent are contiguous, just account for partial
+ * clusters at extent boundaries
+ */
+ return blocks + 2*(EXT4_SB(inode->i_sb)->s_cluster_ratio - 1);
+}
+
/*
* This function controls whether or not we should try to go down the
* dioread_nolock code paths, which makes it safe to avoid taking
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index fb0f99dc8c22..0e8708b77da6 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -100,29 +100,41 @@ static int ext4_split_extent_at(handle_t *handle,
static int ext4_find_delayed_extent(struct inode *inode,
struct extent_status *newes);
-static int ext4_ext_truncate_extend_restart(handle_t *handle,
- struct inode *inode,
- int needed)
+static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
{
- int err;
-
- if (!ext4_handle_valid(handle))
- return 0;
- if (handle->h_buffer_credits >= needed)
- return 0;
/*
- * If we need to extend the journal get a few extra blocks
- * while we're at it for efficiency's sake.
+ * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
+ * moment, get_block can be called only for blocks inside i_size since
+ * page cache has been already dropped and writes are blocked by
+ * i_mutex. So we can safely drop the i_data_sem here.
*/
- needed += 3;
- err = ext4_journal_extend(handle, needed - handle->h_buffer_credits);
- if (err <= 0)
- return err;
- err = ext4_truncate_restart_trans(handle, inode, needed);
- if (err == 0)
- err = -EAGAIN;
+ BUG_ON(EXT4_JOURNAL(inode) == NULL);
+ ext4_discard_preallocations(inode);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ *dropped = 1;
+ return 0;
+}
- return err;
+/*
+ * Make sure 'handle' has at least 'check_cred' credits. If not, restart
+ * transaction with 'restart_cred' credits. The function drops i_data_sem
+ * when restarting transaction and gets it after transaction is restarted.
+ *
+ * The function returns 0 on success, 1 if transaction had to be restarted,
+ * and < 0 in case of fatal error.
+ */
+int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
+ int check_cred, int restart_cred,
+ int revoke_cred)
+{
+ int ret;
+ int dropped = 0;
+
+ ret = ext4_journal_ensure_credits_fn(handle, check_cred, restart_cred,
+ revoke_cred, ext4_ext_trunc_restart_fn(inode, &dropped));
+ if (dropped)
+ down_write(&EXT4_I(inode)->i_data_sem);
+ return ret;
}
/*
@@ -1753,16 +1765,9 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
*/
if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
return 0;
- /*
- * The check for IO to unwritten extent is somewhat racy as we
- * increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after
- * dropping i_data_sem. But reserved blocks should save us in that
- * case.
- */
+
if (ext4_ext_is_unwritten(ex1) &&
- (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
- atomic_read(&EXT4_I(inode)->i_unwritten) ||
- (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)))
+ ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)
return 0;
#ifdef AGGRESSIVE_TEST
if (ext1_ee_len >= 4)
@@ -1840,7 +1845,8 @@ static void ext4_ext_try_to_merge_up(handle_t *handle,
* group descriptor to release the extent tree block. If we
* can't get the journal credits, give up.
*/
- if (ext4_journal_extend(handle, 2))
+ if (ext4_journal_extend(handle, 2,
+ ext4_free_metadata_revoke_credits(inode->i_sb, 1)))
return;
/*
@@ -2727,7 +2733,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
int err = 0, correct_index = 0;
- int depth = ext_depth(inode), credits;
+ int depth = ext_depth(inode), credits, revoke_credits;
struct ext4_extent_header *eh;
ext4_lblk_t a, b;
unsigned num;
@@ -2819,10 +2825,23 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
credits += (ext_depth(inode)) + 1;
}
credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
-
- err = ext4_ext_truncate_extend_restart(handle, inode, credits);
- if (err)
+ /*
+ * We may end up freeing some index blocks and data from the
+ * punched range. Note that partial clusters are accounted for
+ * by ext4_free_data_revoke_credits().
+ */
+ revoke_credits =
+ ext4_free_metadata_revoke_credits(inode->i_sb,
+ ext_depth(inode)) +
+ ext4_free_data_revoke_credits(inode, b - a + 1);
+
+ err = ext4_datasem_ensure_credits(handle, inode, credits,
+ credits, revoke_credits);
+ if (err) {
+ if (err > 0)
+ err = -EAGAIN;
goto out;
+ }
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
@@ -2948,7 +2967,9 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
ext_debug("truncate since %u to %u\n", start, end);
/* probably first extent we're gonna free will be last in block */
- handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, depth + 1);
+ handle = ext4_journal_start_with_revoke(inode, EXT4_HT_TRUNCATE,
+ depth + 1,
+ ext4_free_metadata_revoke_credits(inode->i_sb, depth));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -4962,23 +4983,13 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
int ret = 0;
int ret2 = 0;
struct ext4_map_blocks map;
- unsigned int credits, blkbits = inode->i_blkbits;
+ unsigned int blkbits = inode->i_blkbits;
+ unsigned int credits = 0;
map.m_lblk = offset >> blkbits;
max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
- /*
- * This is somewhat ugly but the idea is clear: When transaction is
- * reserved, everything goes into it. Otherwise we rather start several
- * smaller transactions for conversion of each extent separately.
- */
- if (handle) {
- handle = ext4_journal_start_reserved(handle,
- EXT4_HT_EXT_CONVERT);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
- credits = 0;
- } else {
+ if (!handle) {
/*
* credits to insert 1 extent into extent tree
*/
@@ -5009,11 +5020,40 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
if (ret <= 0 || ret2)
break;
}
- if (!credits)
- ret2 = ext4_journal_stop(handle);
return ret > 0 ? ret2 : ret;
}
+int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end)
+{
+ int ret, err = 0;
+ struct ext4_io_end_vec *io_end_vec;
+
+ /*
+ * This is somewhat ugly but the idea is clear: When transaction is
+ * reserved, everything goes into it. Otherwise we rather start several
+ * smaller transactions for conversion of each extent separately.
+ */
+ if (handle) {
+ handle = ext4_journal_start_reserved(handle,
+ EXT4_HT_EXT_CONVERT);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ }
+
+ list_for_each_entry(io_end_vec, &io_end->list_vec, list) {
+ ret = ext4_convert_unwritten_extents(handle, io_end->inode,
+ io_end_vec->offset,
+ io_end_vec->size);
+ if (ret)
+ break;
+ }
+
+ if (handle)
+ err = ext4_journal_stop(handle);
+
+ return ret < 0 ? ret : err;
+}
+
/*
* If newes is not existing extent (newes->ec_pblk equals zero) find
* delayed extent at start of newes and update newes accordingly and
@@ -5206,13 +5246,10 @@ ext4_access_path(handle_t *handle, struct inode *inode,
* descriptor) for each block group; assume two block
* groups
*/
- if (handle->h_buffer_credits < 7) {
- credits = ext4_writepage_trans_blocks(inode);
- err = ext4_ext_truncate_extend_restart(handle, inode, credits);
- /* EAGAIN is success */
- if (err && err != -EAGAIN)
- return err;
- }
+ credits = ext4_writepage_trans_blocks(inode);
+ err = ext4_datasem_ensure_credits(handle, inode, 7, credits, 0);
+ if (err < 0)
+ return err;
err = ext4_ext_get_access(handle, inode, path);
return err;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 8d2bbcc2d813..6a7293a5cda2 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -29,10 +29,58 @@
#include <linux/pagevec.h>
#include <linux/uio.h>
#include <linux/mman.h>
+#include <linux/backing-dev.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
+#include "truncate.h"
+
+static bool ext4_dio_supported(struct inode *inode)
+{
+ if (IS_ENABLED(CONFIG_FS_ENCRYPTION) && IS_ENCRYPTED(inode))
+ return false;
+ if (fsverity_active(inode))
+ return false;
+ if (ext4_should_journal_data(inode))
+ return false;
+ if (ext4_has_inline_data(inode))
+ return false;
+ return true;
+}
+
+static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ ssize_t ret;
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!inode_trylock_shared(inode))
+ return -EAGAIN;
+ } else {
+ inode_lock_shared(inode);
+ }
+
+ if (!ext4_dio_supported(inode)) {
+ inode_unlock_shared(inode);
+ /*
+ * Fallback to buffered I/O if the operation being performed on
+ * the inode is not supported by direct I/O. The IOCB_DIRECT
+ * flag needs to be cleared here in order to ensure that the
+ * direct I/O path within generic_file_read_iter() is not
+ * taken.
+ */
+ iocb->ki_flags &= ~IOCB_DIRECT;
+ return generic_file_read_iter(iocb, to);
+ }
+
+ ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL,
+ is_sync_kiocb(iocb));
+ inode_unlock_shared(inode);
+
+ file_accessed(iocb->ki_filp);
+ return ret;
+}
#ifdef CONFIG_FS_DAX
static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
@@ -64,16 +112,21 @@ static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
- if (unlikely(ext4_forced_shutdown(EXT4_SB(file_inode(iocb->ki_filp)->i_sb))))
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
if (!iov_iter_count(to))
return 0; /* skip atime */
#ifdef CONFIG_FS_DAX
- if (IS_DAX(file_inode(iocb->ki_filp)))
+ if (IS_DAX(inode))
return ext4_dax_read_iter(iocb, to);
#endif
+ if (iocb->ki_flags & IOCB_DIRECT)
+ return ext4_dio_read_iter(iocb, to);
+
return generic_file_read_iter(iocb, to);
}
@@ -103,13 +156,6 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
return 0;
}
-static void ext4_unwritten_wait(struct inode *inode)
-{
- wait_queue_head_t *wq = ext4_ioend_wq(inode);
-
- wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
-}
-
/*
* This tests whether the IO in question is block-aligned or not.
* Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
@@ -162,13 +208,13 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = file_inode(iocb->ki_filp);
ssize_t ret;
+ if (unlikely(IS_IMMUTABLE(inode)))
+ return -EPERM;
+
ret = generic_write_checks(iocb, from);
if (ret <= 0)
return ret;
- if (unlikely(IS_IMMUTABLE(inode)))
- return -EPERM;
-
/*
* If we have encountered a bitmap-format file, the size limit
* is smaller than s_maxbytes, which is for extent-mapped files.
@@ -180,56 +226,266 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
return -EFBIG;
iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
}
+
+ ret = file_modified(iocb->ki_filp);
+ if (ret)
+ return ret;
+
return iov_iter_count(from);
}