summaryrefslogtreecommitdiffstats
path: root/fs/io_uring.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2020-08-03 13:01:22 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2020-08-03 13:01:22 -0700
commitcdc8fcb49905c0b67e355e027cb462ee168ffaa3 (patch)
tree51d691db50761bd26e71e9f491c3dc82c2fe147c /fs/io_uring.c
parent382625d0d4325fb14a29444eb8dce8dcc2eb9b51 (diff)
parentfa15bafb71fd7a4d6018dae87cfaf890fd4ab47f (diff)
Merge tag 'for-5.9/io_uring-20200802' of git://git.kernel.dk/linux-block
Pull io_uring updates from Jens Axboe: "Lots of cleanups in here, hardening the code and/or making it easier to read and fixing bugs, but a core feature/change too adding support for real async buffered reads. With the latter in place, we just need buffered write async support and we're done relying on kthreads for the fast path. In detail: - Cleanup how memory accounting is done on ring setup/free (Bijan) - sq array offset calculation fixup (Dmitry) - Consistently handle blocking off O_DIRECT submission path (me) - Support proper async buffered reads, instead of relying on kthread offload for that. This uses the page waitqueue to drive retries from task_work, like we handle poll based retry. (me) - IO completion optimizations (me) - Fix race with accounting and ring fd install (me) - Support EPOLLEXCLUSIVE (Jiufei) - Get rid of the io_kiocb unionizing, made possible by shrinking other bits (Pavel) - Completion side cleanups (Pavel) - Cleanup REQ_F_ flags handling, and kill off many of them (Pavel) - Request environment grabbing cleanups (Pavel) - File and socket read/write cleanups (Pavel) - Improve kiocb_set_rw_flags() (Pavel) - Tons of fixes and cleanups (Pavel) - IORING_SQ_NEED_WAKEUP clear fix (Xiaoguang)" * tag 'for-5.9/io_uring-20200802' of git://git.kernel.dk/linux-block: (127 commits) io_uring: flip if handling after io_setup_async_rw fs: optimise kiocb_set_rw_flags() io_uring: don't touch 'ctx' after installing file descriptor io_uring: get rid of atomic FAA for cq_timeouts io_uring: consolidate *_check_overflow accounting io_uring: fix stalled deferred requests io_uring: fix racy overflow count reporting io_uring: deduplicate __io_complete_rw() io_uring: de-unionise io_kiocb io-wq: update hash bits io_uring: fix missing io_queue_linked_timeout() io_uring: mark ->work uninitialised after cleanup io_uring: deduplicate io_grab_files() calls io_uring: don't do opcode prep twice io_uring: clear IORING_SQ_NEED_WAKEUP after executing task works io_uring: batch put_task_struct() tasks: add put_task_struct_many() io_uring: return locked and pinned page accounting io_uring: don't miscount pinned memory io_uring: don't open-code recv kbuf managment ...
Diffstat (limited to 'fs/io_uring.c')
-rw-r--r--fs/io_uring.c2545
1 files changed, 1449 insertions, 1096 deletions
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 493e5047e67c..2a3af95be4ca 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -78,6 +78,7 @@
#include <linux/fs_struct.h>
#include <linux/splice.h>
#include <linux/task_work.h>
+#include <linux/pagemap.h>
#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
@@ -226,7 +227,7 @@ struct io_ring_ctx {
struct {
unsigned int flags;
unsigned int compat: 1;
- unsigned int account_mem: 1;
+ unsigned int limit_mem: 1;
unsigned int cq_overflow_flushed: 1;
unsigned int drain_next: 1;
unsigned int eventfd_async: 1;
@@ -319,12 +320,12 @@ struct io_ring_ctx {
spinlock_t completion_lock;
/*
- * ->poll_list is protected by the ctx->uring_lock for
+ * ->iopoll_list is protected by the ctx->uring_lock for
* io_uring instances that don't use IORING_SETUP_SQPOLL.
* For SQPOLL, only the single threaded io_sq_thread() will
* manipulate the list, hence no extra locking is needed there.
*/
- struct list_head poll_list;
+ struct list_head iopoll_list;
struct hlist_head *cancel_hash;
unsigned cancel_hash_bits;
bool poll_multi_file;
@@ -395,6 +396,7 @@ struct io_timeout {
int flags;
u32 off;
u32 target_seq;
+ struct list_head list;
};
struct io_rw {
@@ -413,7 +415,7 @@ struct io_connect {
struct io_sr_msg {
struct file *file;
union {
- struct user_msghdr __user *msg;
+ struct user_msghdr __user *umsg;
void __user *buf;
};
int msg_flags;
@@ -486,6 +488,12 @@ struct io_statx {
struct statx __user *buffer;
};
+struct io_completion {
+ struct file *file;
+ struct list_head list;
+ int cflags;
+};
+
struct io_async_connect {
struct sockaddr_storage address;
};
@@ -503,6 +511,7 @@ struct io_async_rw {
struct iovec *iov;
ssize_t nr_segs;
ssize_t size;
+ struct wait_page_queue wpq;
};
struct io_async_ctx {
@@ -523,23 +532,18 @@ enum {
REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
REQ_F_LINK_HEAD_BIT,
- REQ_F_LINK_NEXT_BIT,
REQ_F_FAIL_LINK_BIT,
REQ_F_INFLIGHT_BIT,
REQ_F_CUR_POS_BIT,
REQ_F_NOWAIT_BIT,
REQ_F_LINK_TIMEOUT_BIT,
- REQ_F_TIMEOUT_BIT,
REQ_F_ISREG_BIT,
- REQ_F_MUST_PUNT_BIT,
- REQ_F_TIMEOUT_NOSEQ_BIT,
REQ_F_COMP_LOCKED_BIT,
REQ_F_NEED_CLEANUP_BIT,
REQ_F_OVERFLOW_BIT,
REQ_F_POLLED_BIT,
REQ_F_BUFFER_SELECTED_BIT,
REQ_F_NO_FILE_TABLE_BIT,
- REQ_F_QUEUE_TIMEOUT_BIT,
REQ_F_WORK_INITIALIZED_BIT,
REQ_F_TASK_PINNED_BIT,
@@ -563,8 +567,6 @@ enum {
/* head of a link */
REQ_F_LINK_HEAD = BIT(REQ_F_LINK_HEAD_BIT),
- /* already grabbed next link */
- REQ_F_LINK_NEXT = BIT(REQ_F_LINK_NEXT_BIT),
/* fail rest of links */
REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT),
/* on inflight list */
@@ -575,14 +577,8 @@ enum {
REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
/* has linked timeout */
REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
- /* timeout request */
- REQ_F_TIMEOUT = BIT(REQ_F_TIMEOUT_BIT),
/* regular file */
REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
- /* must be punted even for NONBLOCK */
- REQ_F_MUST_PUNT = BIT(REQ_F_MUST_PUNT_BIT),
- /* no timeout sequence */
- REQ_F_TIMEOUT_NOSEQ = BIT(REQ_F_TIMEOUT_NOSEQ_BIT),
/* completion under lock */
REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT),
/* needs cleanup */
@@ -595,8 +591,6 @@ enum {
REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
/* doesn't need file table for this request */
REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT),
- /* needs to queue linked timeout */
- REQ_F_QUEUE_TIMEOUT = BIT(REQ_F_QUEUE_TIMEOUT_BIT),
/* io_wq_work is initialized */
REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT),
/* req->task is refcounted */
@@ -606,7 +600,6 @@ enum {
struct async_poll {
struct io_poll_iocb poll;
struct io_poll_iocb *double_poll;
- struct io_wq_work work;
};
/*
@@ -635,51 +628,54 @@ struct io_kiocb {
struct io_splice splice;
struct io_provide_buf pbuf;
struct io_statx statx;
+ /* use only after cleaning per-op data, see io_clean_op() */
+ struct io_completion compl;
};
struct io_async_ctx *io;
- int cflags;
u8 opcode;
/* polled IO has completed */
u8 iopoll_completed;
u16 buf_index;
+ u32 result;
- struct io_ring_ctx *ctx;
- struct list_head list;
- unsigned int flags;
- refcount_t refs;
- struct task_struct *task;
- unsigned long fsize;
- u64 user_data;
- u32 result;
- u32 sequence;
-
- struct list_head link_list;
+ struct io_ring_ctx *ctx;
+ unsigned int flags;
+ refcount_t refs;
+ struct task_struct *task;
+ u64 user_data;
- struct list_head inflight_entry;
+ struct list_head link_list;
- struct percpu_ref *fixed_file_refs;
+ /*
+ * 1. used with ctx->iopoll_list with reads/writes
+ * 2. to track reqs with ->files (see io_op_def::file_table)
+ */
+ struct list_head inflight_entry;
+
+ struct percpu_ref *fixed_file_refs;
+ struct callback_head task_work;
+ /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
+ struct hlist_node hash_node;
+ struct async_poll *apoll;
+ struct io_wq_work work;
+};
- union {
- /*
- * Only commands that never go async can use the below fields,
- * obviously. Right now only IORING_OP_POLL_ADD uses them, and
- * async armed poll handlers for regular commands. The latter
- * restore the work, if needed.
- */
- struct {
- struct callback_head task_work;
- struct hlist_node hash_node;
- struct async_poll *apoll;
- };
- struct io_wq_work work;
- };
+struct io_defer_entry {
+ struct list_head list;
+ struct io_kiocb *req;
+ u32 seq;
};
-#define IO_PLUG_THRESHOLD 2
#define IO_IOPOLL_BATCH 8
+struct io_comp_state {
+ unsigned int nr;
+ struct list_head list;
+ struct io_ring_ctx *ctx;
+};
+
struct io_submit_state {
struct blk_plug plug;
@@ -690,12 +686,16 @@ struct io_submit_state {
unsigned int free_reqs;
/*
+ * Batch completion logic
+ */
+ struct io_comp_state comp;
+
+ /*
* File reference cache
*/
struct file *file;
unsigned int fd;
unsigned int has_refs;
- unsigned int used_refs;
unsigned int ios_left;
};
@@ -723,6 +723,7 @@ struct io_op_def {
unsigned pollout : 1;
/* op supports buffer selection */
unsigned buffer_select : 1;
+ unsigned needs_fsize : 1;
};
static const struct io_op_def io_op_defs[] = {
@@ -742,6 +743,7 @@ static const struct io_op_def io_op_defs[] = {
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
+ .needs_fsize = 1,
},
[IORING_OP_FSYNC] = {
.needs_file = 1,
@@ -756,6 +758,7 @@ static const struct io_op_def io_op_defs[] = {
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
+ .needs_fsize = 1,
},
[IORING_OP_POLL_ADD] = {
.needs_file = 1,
@@ -808,6 +811,7 @@ static const struct io_op_def io_op_defs[] = {
},
[IORING_OP_FALLOCATE] = {
.needs_file = 1,
+ .needs_fsize = 1,
},
[IORING_OP_OPENAT] = {
.file_table = 1,
@@ -839,6 +843,7 @@ static const struct io_op_def io_op_defs[] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
+ .needs_fsize = 1,
},
[IORING_OP_FADVISE] = {
.needs_file = 1,
@@ -881,22 +886,37 @@ static const struct io_op_def io_op_defs[] = {
},
};
-static void io_wq_submit_work(struct io_wq_work **workptr);
+enum io_mem_account {
+ ACCT_LOCKED,
+ ACCT_PINNED,
+};
+
+static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
+ struct io_comp_state *cs);
static void io_cqring_fill_event(struct io_kiocb *req, long res);
static void io_put_req(struct io_kiocb *req);
+static void io_double_put_req(struct io_kiocb *req);
static void __io_double_put_req(struct io_kiocb *req);
static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
struct io_uring_files_update *ip,
unsigned nr_args);
-static int io_grab_files(struct io_kiocb *req);
-static void io_complete_rw_common(struct kiocb *kiocb, long res);
-static void io_cleanup_req(struct io_kiocb *req);
+static int io_prep_work_files(struct io_kiocb *req);
+static void __io_clean_op(struct io_kiocb *req);
static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
int fd, struct file **out_file, bool fixed);
static void __io_queue_sqe(struct io_kiocb *req,
- const struct io_uring_sqe *sqe);
+ const struct io_uring_sqe *sqe,
+ struct io_comp_state *cs);
+static void io_file_put_work(struct work_struct *work);
+
+static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
+ struct iovec **iovec, struct iov_iter *iter,
+ bool needs_lock);
+static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
+ struct iovec *iovec, struct iovec *fast_iov,
+ struct iov_iter *iter);
static struct kmem_cache *req_cachep;
@@ -923,6 +943,12 @@ static void io_get_req_task(struct io_kiocb *req)
req->flags |= REQ_F_TASK_PINNED;
}
+static inline void io_clean_op(struct io_kiocb *req)
+{
+ if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED))
+ __io_clean_op(req);
+}
+
/* not idempotent -- it doesn't clear REQ_F_TASK_PINNED */
static void __io_put_req_task(struct io_kiocb *req)
{
@@ -930,7 +956,41 @@ static void __io_put_req_task(struct io_kiocb *req)
put_task_struct(req->task);
}
-static void io_file_put_work(struct work_struct *work);
+static void io_sq_thread_drop_mm(void)
+{
+ struct mm_struct *mm = current->mm;
+
+ if (mm) {
+ kthread_unuse_mm(mm);
+ mmput(mm);
+ }
+}
+
+static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
+{
+ if (!current->mm) {
+ if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL) ||
+ !mmget_not_zero(ctx->sqo_mm)))
+ return -EFAULT;
+ kthread_use_mm(ctx->sqo_mm);
+ }
+
+ return 0;
+}
+
+static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
+ struct io_kiocb *req)
+{
+ if (!io_op_defs[req->opcode].needs_mm)
+ return 0;
+ return __io_sq_thread_acquire_mm(ctx);
+}
+
+static inline void req_set_fail_links(struct io_kiocb *req)
+{
+ if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
+ req->flags |= REQ_F_FAIL_LINK;
+}
/*
* Note: must call io_req_init_async() for the first time you
@@ -957,6 +1017,11 @@ static void io_ring_ctx_ref_free(struct percpu_ref *ref)
complete(&ctx->ref_comp);
}
+static inline bool io_is_timeout_noseq(struct io_kiocb *req)
+{
+ return !req->timeout.off;
+}
+
static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
struct io_ring_ctx *ctx;
@@ -1000,7 +1065,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
mutex_init(&ctx->uring_lock);
init_waitqueue_head(&ctx->wait);
spin_lock_init(&ctx->completion_lock);
- INIT_LIST_HEAD(&ctx->poll_list);
+ INIT_LIST_HEAD(&ctx->iopoll_list);
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
init_waitqueue_head(&ctx->inflight_wait);
@@ -1017,18 +1082,14 @@ err:
return NULL;
}
-static inline bool __req_need_defer(struct io_kiocb *req)
+static bool req_need_defer(struct io_kiocb *req, u32 seq)
{
- struct io_ring_ctx *ctx = req->ctx;
+ if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
+ struct io_ring_ctx *ctx = req->ctx;
- return req->sequence != ctx->cached_cq_tail
+ return seq != ctx->cached_cq_tail
+ atomic_read(&ctx->cached_cq_overflow);
-}
-
-static inline bool req_need_defer(struct io_kiocb *req)
-{
- if (unlikely(req->flags & REQ_F_IO_DRAIN))
- return __req_need_defer(req);
+ }
return false;
}
@@ -1046,28 +1107,7 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx)
}
}
-static inline void io_req_work_grab_env(struct io_kiocb *req,
- const struct io_op_def *def)
-{
- if (!req->work.mm && def->needs_mm) {
- mmgrab(current->mm);
- req->work.mm = current->mm;
- }
- if (!req->work.creds)
- req->work.creds = get_current_cred();
- if (!req->work.fs && def->needs_fs) {
- spin_lock(&current->fs->lock);
- if (!current->fs->in_exec) {
- req->work.fs = current->fs;
- req->work.fs->users++;
- } else {
- req->work.flags |= IO_WQ_WORK_CANCEL;
- }
- spin_unlock(&current->fs->lock);
- }
-}
-
-static inline void io_req_work_drop_env(struct io_kiocb *req)
+static void io_req_clean_work(struct io_kiocb *req)
{
if (!(req->flags & REQ_F_WORK_INITIALIZED))
return;
@@ -1089,11 +1129,12 @@ static inline void io_req_work_drop_env(struct io_kiocb *req)
spin_unlock(&req->work.fs->lock);
if (fs)
free_fs_struct(fs);
+ req->work.fs = NULL;
}
+ req->flags &= ~REQ_F_WORK_INITIALIZED;
}
-static inline void io_prep_async_work(struct io_kiocb *req,
- struct io_kiocb **link)
+static void io_prep_async_work(struct io_kiocb *req)
{
const struct io_op_def *def = &io_op_defs[req->opcode];
@@ -1106,18 +1147,42 @@ static inline void io_prep_async_work(struct io_kiocb *req,
if (def->unbound_nonreg_file)
req->work.flags |= IO_WQ_WORK_UNBOUND;
}
+ if (!req->work.mm && def->needs_mm) {
+ mmgrab(current->mm);
+ req->work.mm = current->mm;
+ }
+ if (!req->work.creds)
+ req->work.creds = get_current_cred();
+ if (!req->work.fs && def->needs_fs) {
+ spin_lock(&current->fs->lock);
+ if (!current->fs->in_exec) {
+ req->work.fs = current->fs;
+ req->work.fs->users++;
+ } else {
+ req->work.flags |= IO_WQ_WORK_CANCEL;
+ }
+ spin_unlock(&current->fs->lock);
+ }
+ if (def->needs_fsize)
+ req->work.fsize = rlimit(RLIMIT_FSIZE);
+ else
+ req->work.fsize = RLIM_INFINITY;
+}
- io_req_work_grab_env(req, def);
+static void io_prep_async_link(struct io_kiocb *req)
+{
+ struct io_kiocb *cur;
- *link = io_prep_linked_timeout(req);
+ io_prep_async_work(req);
+ if (req->flags & REQ_F_LINK_HEAD)
+ list_for_each_entry(cur, &req->link_list, link_list)
+ io_prep_async_work(cur);
}
-static inline void io_queue_async_work(struct io_kiocb *req)
+static void __io_queue_async_work(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- struct io_kiocb *link;
-
- io_prep_async_work(req, &link);
+ struct io_kiocb *link = io_prep_linked_timeout(req);
trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
&req->work, req->flags);
@@ -1127,14 +1192,22 @@ static inline void io_queue_async_work(struct io_kiocb *req)
io_queue_linked_timeout(link);
}
+static void io_queue_async_work(struct io_kiocb *req)
+{
+ /* init ->work of the whole link before punting */
+ io_prep_async_link(req);
+ __io_queue_async_work(req);
+}
+
static void io_kill_timeout(struct io_kiocb *req)
{
int ret;
ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
if (ret != -1) {
- atomic_inc(&req->ctx->cq_timeouts);
- list_del_init(&req->list);
+ atomic_set(&req->ctx->cq_timeouts,
+ atomic_read(&req->ctx->cq_timeouts) + 1);
+ list_del_init(&req->timeout.list);
req->flags |= REQ_F_COMP_LOCKED;
io_cqring_fill_event(req, 0);
io_put_req(req);
@@ -1146,7 +1219,7 @@ static void io_kill_timeouts(struct io_ring_ctx *ctx)
struct io_kiocb *req, *tmp;
spin_lock_irq(&ctx->completion_lock);
- list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list)
+ list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list)
io_kill_timeout(req);
spin_unlock_irq(&ctx->completion_lock);
}
@@ -1154,13 +1227,15 @@ static void io_kill_timeouts(struct io_ring_ctx *ctx)
static void __io_queue_deferred(struct io_ring_ctx *ctx)
{
do {
- struct io_kiocb *req = list_first_entry(&ctx->defer_list,
- struct io_kiocb, list);
+ struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
+ struct io_defer_entry, list);
- if (req_need_defer(req))
+ if (req_need_defer(de->req, de->seq))
break;
- list_del_init(&req->list);
- io_queue_async_work(req);
+ list_del_init(&de->list);
+ /* punt-init is done before queueing for defer */
+ __io_queue_async_work(de->req);
+ kfree(de);
} while (!list_empty(&ctx->defer_list));
}
@@ -1168,15 +1243,15 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx)
{
while (!list_empty(&ctx->timeout_list)) {
struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
- struct io_kiocb, list);
+ struct io_kiocb, timeout.list);
- if (req->flags & REQ_F_TIMEOUT_NOSEQ)
+ if (io_is_timeout_noseq(req))
break;
if (req->timeout.target_seq != ctx->cached_cq_tail
- atomic_read(&ctx->cq_timeouts))
break;
- list_del_init(&req->list);
+ list_del_init(&req->timeout.list);
io_kill_timeout(req);
}
}
@@ -1229,6 +1304,15 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
eventfd_signal(ctx->cq_ev_fd, 1);
}
+static void io_cqring_mark_overflow(struct io_ring_ctx *ctx)
+{
+ if (list_empty(&ctx->cq_overflow_list)) {
+ clear_bit(0, &ctx->sq_check_overflow);
+ clear_bit(0, &ctx->cq_check_overflow);
+ ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
+ }
+}
+
/* Returns true if there are no backlogged entries after the flush */
static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
{
@@ -1259,13 +1343,13 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
break;
req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb,
- list);
- list_move(&req->list, &list);
+ compl.list);
+ list_move(&req->compl.list, &list);
req->flags &= ~REQ_F_OVERFLOW;
if (cqe) {
WRITE_ONCE(cqe->user_data, req->user_data);
WRITE_ONCE(cqe->res, req->result);
- WRITE_ONCE(cqe->flags, req->cflags);
+ WRITE_ONCE(cqe->flags, req->compl.cflags);
} else {
WRITE_ONCE(ctx->rings->cq_overflow,
atomic_inc_return(&ctx->cached_cq_overflow));
@@ -1273,17 +1357,14 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
}
io_commit_cqring(ctx);
- if (cqe) {
- clear_bit(0, &ctx->sq_check_overflow);
- clear_bit(0, &ctx->cq_check_overflow);
- ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
- }
+ io_cqring_mark_overflow(ctx);
+
spin_unlock_irqrestore(&ctx->completion_lock, flags);
io_cqring_ev_posted(ctx);
while (!list_empty(&list)) {
- req = list_first_entry(&list, struct io_kiocb, list);
- list_del(&req->list);
+ req = list_first_entry(&list, struct io_kiocb, compl.list);
+ list_del(&req->compl.list);
io_put_req(req);
}
@@ -1316,11 +1397,12 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
set_bit(0, &ctx->cq_check_overflow);
ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
}
+ io_clean_op(req);
req->flags |= REQ_F_OVERFLOW;
- refcount_inc(&req->refs);
req->result = res;
- req->cflags = cflags;
- list_add_tail(&req->list, &ctx->cq_overflow_list);
+ req->compl.cflags = cflags;
+ refcount_inc(&req->refs);
+ list_add_tail(&req->compl.list, &ctx->cq_overflow_list);
}
}
@@ -1329,7 +1411,7 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res)
__io_cqring_fill_event(req, res, 0);
}
-static void __io_cqring_add_event(struct io_kiocb *req, long res, long cflags)
+static void io_cqring_add_event(struct io_kiocb *req, long res, long cflags)
{
struct io_ring_ctx *ctx = req->ctx;
unsigned long flags;
@@ -1342,9 +1424,52 @@ static void __io_cqring_add_event(struct io_kiocb *req, long res, long cflags)
io_cqring_ev_posted(ctx);
}
-static void io_cqring_add_event(struct io_kiocb *req, long res)
+static void io_submit_flush_completions(struct io_comp_state *cs)
+{
+ struct io_ring_ctx *ctx = cs->ctx;
+
+ spin_lock_irq(&ctx->completion_lock);
+ while (!list_empty(&cs->list)) {
+ struct io_kiocb *req;
+
+ req = list_first_entry(&cs->list, struct io_kiocb, compl.list);
+ list_del(&req->compl.list);
+ __io_cqring_fill_event(req, req->result, req->compl.cflags);
+ if (!(req->flags & REQ_F_LINK_HEAD)) {
+ req->flags |= REQ_F_COMP_LOCKED;
+ io_put_req(req);
+ } else {
+ spin_unlock_irq(&ctx->completion_lock);
+ io_put_req(req);
+ spin_lock_irq(&ctx->completion_lock);
+ }
+ }
+ io_commit_cqring(ctx);
+ spin_unlock_irq(&ctx->completion_lock);
+
+ io_cqring_ev_posted(ctx);
+ cs->nr = 0;
+}
+
+static void __io_req_complete(struct io_kiocb *req, long res, unsigned cflags,
+ struct io_comp_state *cs)
{
- __io_cqring_add_event(req, res, 0);
+ if (!cs) {
+ io_cqring_add_event(req, res, cflags);
+ io_put_req(req);
+ } else {
+ io_clean_op(req);
+ req->result = res;
+ req->compl.cflags = cflags;
+ list_add_tail(&req->compl.list, &cs->list);
+ if (++cs->nr >= 32)
+ io_submit_flush_completions(cs);
+ }
+}
+
+static void io_req_complete(struct io_kiocb *req, long res)
+{
+ __io_req_complete(req, res, 0, NULL);
}
static inline bool io_is_fallback_req(struct io_kiocb *req)
@@ -1370,11 +1495,7 @@ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx,
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
struct io_kiocb *req;
- if (!state) {
- req = kmem_cache_alloc(req_cachep, gfp);
- if (unlikely(!req))
- goto fallback;
- } else if (!state->free_reqs) {
+ if (!state->free_reqs) {
size_t sz;
int ret;
@@ -1412,21 +1533,15 @@ static inline void io_put_file(struct io_kiocb *req, struct file *file,
fput(file);
}
-static void __io_req_aux_free(struct io_kiocb *req)
+static void io_dismantle_req(struct io_kiocb *req)
{
- if (req->flags & REQ_F_NEED_CLEANUP)
- io_cleanup_req(req);
+ io_clean_op(req);
- kfree(req->io);
+ if (req->io)
+ kfree(req->io);
if (req->file)
io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
- __io_put_req_task(req);
- io_req_work_drop_env(req);
-}
-
-static void __io_free_req(struct io_kiocb *req)
-{
- __io_req_aux_free(req);
+ io_req_clean_work(req);
if (req->flags & REQ_F_INFLIGHT) {
struct io_ring_ctx *ctx = req->ctx;
@@ -1438,57 +1553,20 @@ static void __io_free_req(struct io_kiocb *req)
wake_up(&ctx->inflight_wait);
spin_unlock_irqrestore(&ctx->inflight_lock, flags);
}
-
- percpu_ref_put(&req->ctx->refs);
- if (likely(!io_is_fallback_req(req)))
- kmem_cache_free(req_cachep, req);
- else
- clear_bit_unlock(0, (unsigned long *) &req->ctx->fallback_req);
}
-struct req_batch {
- void *reqs[IO_IOPOLL_BATCH];
- int to_free;
- int need_iter;
-};
-
-static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
+static void __io_free_req(struct io_kiocb *req)
{
- if (!rb->to_free)
- return;
- if (rb->need_iter) {
- int i, inflight = 0;
- unsigned long flags;
-
- for (i = 0; i < rb->to_free; i++) {
- struct io_kiocb *req = rb->reqs[i];
-
- if (req->flags & REQ_F_INFLIGHT)
- inflight++;
- __io_req_aux_free(req);
- }
- if (!inflight)
- goto do_free;
-
- spin_lock_irqsave(&ctx->inflight_lock, flags);
- for (i = 0; i < rb->to_free; i++) {
- struct io_kiocb *req = rb->reqs[i];
-
- if (req->flags & REQ_F_INFLIGHT) {
- list_del(&req->inflight_entry);
- if (!--inflight)
- break;
- }
- }
- spin_unlock_irqrestore(&ctx->inflight_lock, flags);
+ struct io_ring_ctx *ctx;
- if (waitqueue_active(&ctx->inflight_wait))
- wake_up(&ctx->inflight_wait);
- }
-do_free:
- kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
- percpu_ref_put_many(&ctx->refs, rb->to_free);
- rb->to_free = rb->need_iter = 0;
+ io_dismantle_req(req);
+ __io_put_req_task(req);
+ ctx = req->ctx;
+ if (likely(!io_is_fallback_req(req)))
+ kmem_cache_free(req_cachep, req);
+ else
+ clear_bit_unlock(0, (unsigned long *) &ctx->fallback_req);
+ percpu_ref_put(&ctx->refs);
}
static bool io_link_cancel_timeout(struct io_kiocb *req)
@@ -1508,53 +1586,67 @@ static bool io_link_cancel_timeout(struct io_kiocb *req)
return false;
}
-static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
+static bool __io_kill_linked_timeout(struct io_kiocb *req)
+{
+ struct io_kiocb *link;
+ bool wake_ev;
+
+ if (list_empty(&req->link_list))
+ return false;
+ link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
+ if (link->opcode != IORING_OP_LINK_TIMEOUT)
+ return false;
+
+ list_del_init(&link->link_list);
+ wake_ev = io_link_cancel_timeout(link);
+ req->flags &= ~REQ_F_LINK_TIMEOUT;
+ return wake_ev;
+}
+
+static void io_kill_linked_timeout(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- bool wake_ev = false;
+ bool wake_ev;
- /* Already got next link */
- if (req->flags & REQ_F_LINK_NEXT)
- return;
+ if (!(req->flags & REQ_F_COMP_LOCKED)) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+ wake_ev = __io_kill_linked_timeout(req);
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ } else {
+ wake_ev = __io_kill_linked_timeout(req);
+ }
+
+ if (wake_ev)
+ io_cqring_ev_posted(ctx);
+}
+
+static struct io_kiocb *io_req_link_next(struct io_kiocb *req)
+{
+ struct io_kiocb *nxt;
/*
* The list should never be empty when we are called here. But could
* potentially happen if the chain is messed up, check to be on the
* safe side.
*/
- while (!list_empty(&req->link_list)) {
- struct io_kiocb *nxt = list_first_entry(&req->link_list,
- struct io_kiocb, link_list);
-
- if (unlikely((req->flags & REQ_F_LINK_TIMEOUT) &&
- (nxt->flags & REQ_F_TIMEOUT))) {
- list_del_init(&nxt->link_list);
- wake_ev |= io_link_cancel_timeout(nxt);
- req->flags &= ~REQ_F_LINK_TIMEOUT;
- continue;
- }
-
- list_del_init(&req->link_list);
- if (!list_empty(&nxt->link_list))
- nxt->flags |= REQ_F_LINK_HEAD;
- *nxtptr = nxt;
- break;
- }
+ if (unlikely(list_empty(&req->link_list)))
+ return NULL;
- req->flags |= REQ_F_LINK_NEXT;
- if (wake_ev)
- io_cqring_ev_posted(ctx);
+ nxt = list_first_entry(&req->link_list, struct io_kiocb, link_list);
+ list_del_init(&req->link_list);
+ if (!list_empty(&nxt->link_list))
+ nxt->flags |= REQ_F_LINK_HEAD;
+ return nxt;
}
/*
* Called if REQ_F_LINK_HEAD is set, and we fail the head request
*/
-static void io_fail_links(struct io_kiocb *req)
+static void __io_fail_links(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- unsigned long flags;
-
- spin_lock_irqsave(&ctx->completion_lock, flags);
while (!list_empty(&req->link_list)) {
struct io_kiocb *link = list_first_entry(&req->link_list,
@@ -1563,25 +1655,37 @@ static void io_fail_links(struct io_kiocb *req)
list_del_init(&link->link_list);
trace_io_uring_fail_link(req, link);
- if ((req->flags & REQ_F_LINK_TIMEOUT) &&
- link->opcode == IORING_OP_LINK_TIMEOUT) {
- io_link_cancel_timeout(link);
- } else {
- io_cqring_fill_event(link, -ECANCELED);
- __io_double_put_req(link);
- }
+ io_cqring_fill_event(link, -ECANCELED);
+ __io_double_put_req(link);
req->flags &= ~REQ_F_LINK_TIMEOUT;
}
io_commit_cqring(ctx);
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
io_cqring_ev_posted(ctx);
}
-static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
+static void io_fail_links(struct io_kiocb *req)
{
- if (likely(!(req->flags & REQ_F_LINK_HEAD)))
- return;
+ struct io_ring_ctx *ctx = req->ctx;
+
+ if (!(req->flags & REQ_F_COMP_LOCKED)) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+ __io_fail_links(req);
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ } else {
+ __io_fail_links(req);
+ }
+
+ io_cqring_ev_posted(ctx);
+}
+
+static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
+{
+ req->flags &= ~REQ_F_LINK_HEAD;
+ if (req->flags & REQ_F_LINK_TIMEOUT)
+ io_kill_linked_timeout(req);
/*
* If LINK is set, we have dependent requests in this chain. If we
@@ -1589,62 +1693,187 @@ static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
* dependencies to the next request. In case of failure, fail the rest
* of the chain.
*/
- if (req->flags & REQ_F_FAIL_LINK) {
- io_fail_links(req);
- } else if ((req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_COMP_LOCKED)) ==
- REQ_F_LINK_TIMEOUT) {
- struct io_ring_ctx *ctx = req->ctx;
- unsigned long flags;
+ if (likely(!(req->flags & REQ_F_FAIL_LINK)))
+ return io_req_link_next(req);
+ io_fail_links(req);
+ return NULL;
+}
- /*
- * If this is a timeout link, we could be racing with the
- * timeout timer. Grab the completion lock for this case to
- * protect against that.
- */
- spin_lock_irqsave(&ctx->completion_lock, flags);
- io_req_link_next(req, nxt);
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
+static struct io_kiocb *io_req_find_next(struct io_kiocb *req)
+{
+ if (likely(!(req->flags & REQ_F_LINK_HEAD)))
+ return NULL;
+ return __io_req_find_next(req);
+}
+
+static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb)
+{
+ struct task_struct *tsk = req->task;
+ struct io_ring_ctx *ctx = req->ctx;
+ int ret, notify = TWA_RESUME;
+
+ /*
+ * SQPOLL kernel thread doesn't need notification, just a wakeup.
+ * If we're not using an eventfd, then TWA_RESUME is always fine,
+ * as we won't have dependencies between request completions for
+ * other kernel wait conditions.
+ */
+ if (ctx->flags & IORING_SETUP_SQPOLL)
+ notify = 0;
+ else if (ctx->cq_ev_fd)
+ notify = TWA_SIGNAL;
+
+ ret = task_work_add(tsk, cb, notify);
+ if (!ret)
+ wake_up_process(tsk);
+ return ret;
+}
+
+static void __io_req_task_cancel(struct io_kiocb *req, int error)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+
+ spin_lock_irq(&ctx->completion_lock);
+ io_cqring_fill_event(req, error);
+ io_commit_cqring(ctx);
+ spin_unlock_irq(&ctx->completion_lock);
+
+ io_cqring_ev_posted(ctx);
+ req_set_fail_links(req);
+ io_double_put_req(req);
+}
+
+static void io_req_task_cancel(struct callback_head *cb)
+{
+ struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
+
+ __io_req_task_cancel(req, -ECANCELED);
+}
+
+static void __io_req_task_submit(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+
+ if (!__io_sq_thread_acquire_mm(ctx)) {
+ mutex_lock(&ctx->uring_lock);
+ __io_queue_sqe(req, NULL, NULL);
+ mutex_unlock(&ctx->uring_lock);
} else {
- io_req_link_next(req, nxt);
+ __io_req_task_cancel(req, -EFAULT);
}
}
-static void io_free_req(struct io_kiocb *req)
+static void io_req_task_submit(struct callback_head *cb)
{
- struct io_kiocb *nxt = NULL;
+ struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
- io_req_find_next(req, &nxt);
- __io_free_req(req);
+ __io_req_task_submit(req);
+}
+
+static void io_req_task_queue(struct io_kiocb *req)
+{
+ int ret;
+
+ init_task_work(&req->task_work, io_req_task_submit);
+
+ ret = io_req_task_work_add(req, &req->task_work);
+ if (unlikely(ret)) {
+ struct task_struct *tsk;
+
+ init_task_work(&req->task_work, io_req_task_cancel);
+ tsk = io_wq_get_task(req->ctx->io_wq);
+ task_work_add(tsk, &r