From 29de5f6a350778a621a748cecc7efbb8f0cfa5a7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 20 Feb 2020 09:56:08 -0700 Subject: io_uring: consider any io_read/write -EAGAIN as final If the -EAGAIN happens because of a static condition, then a poll or later retry won't fix it. We must call it again from blocking condition. Play it safe and ensure that any -EAGAIN condition from read or write must retry from async context. Signed-off-by: Jens Axboe --- fs/io_uring.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6a595c13e108..64b4519aabf8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2234,7 +2234,7 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, /* Ensure we clear previously set non-block flag */ if (!force_nonblock) - req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT; + kiocb->ki_flags &= ~IOCB_NOWAIT; req->result = 0; io_size = ret; @@ -2245,10 +2245,8 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so * we know to async punt it even if it was opened O_NONBLOCK */ - if (force_nonblock && !io_file_supports_async(req->file)) { - req->flags |= REQ_F_MUST_PUNT; + if (force_nonblock && !io_file_supports_async(req->file)) goto copy_iov; - } iov_count = iov_iter_count(&iter); ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count); @@ -2269,6 +2267,9 @@ copy_iov: inline_vecs, &iter); if (ret) goto out_free; + /* any defer here is final, must blocking retry */ + if (!(req->flags & REQ_F_NOWAIT)) + req->flags |= REQ_F_MUST_PUNT; return -EAGAIN; } } @@ -2334,10 +2335,8 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so * we know to async punt it even if it was opened O_NONBLOCK */ - if (force_nonblock && !io_file_supports_async(req->file)) { - req->flags |= REQ_F_MUST_PUNT; + if (force_nonblock && !io_file_supports_async(req->file)) goto copy_iov; - } /* file path doesn't support NOWAIT for non-direct_IO */ if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) && @@ -2382,6 +2381,8 @@ copy_iov: inline_vecs, &iter); if (ret) goto out_free; + /* any defer here is final, must blocking retry */ + req->flags |= REQ_F_MUST_PUNT; return -EAGAIN; } } -- cgit v1.2.3 From e441d1cf20e1b9fc443e6130488d41e1941aae82 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 20 Feb 2020 09:59:02 -0700 Subject: io_uring: io_accept() should hold on to submit reference on retry Don't drop an early reference, hang on to it and let the caller drop it. This makes it behave more like "regular" requests. 
Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 64b4519aabf8..2bf954a42586 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3354,6 +3354,8 @@ static void io_accept_finish(struct io_wq_work **workptr) struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); struct io_kiocb *nxt = NULL; + io_put_req(req); + if (io_req_cancelled(req)) return; __io_accept(req, &nxt, false); @@ -3371,7 +3373,6 @@ static int io_accept(struct io_kiocb *req, struct io_kiocb **nxt, ret = __io_accept(req, nxt, force_nonblock); if (ret == -EAGAIN && force_nonblock) { req->work.func = io_accept_finish; - io_put_req(req); return -EAGAIN; } return 0; -- cgit v1.2.3 From 5ea62161167eb8297249d3f4dc63741016f01413 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 24 Feb 2020 11:30:16 +0300 Subject: io_uring: don't call work.func from sync ctx Many operations define custom work.func before getting into an io-wq. There are several points against: - it calls io_wq_assign_next() from outside io-wq, that may be confusing - sync context would go unnecessary through io_req_cancelled() - prototypes are quite different, so work!=old_work looks strange - makes async/sync responsibilities fuzzy - adds extra overhead Don't call generic path and io-wq handlers from each other, but use helpers instead Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 76 +++++++++++++++++++++++++++++------------------------------ 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2bf954a42586..83ae190a3d31 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2462,23 +2462,28 @@ static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt) } } -static void io_fsync_finish(struct io_wq_work **workptr) +static void __io_fsync(struct io_kiocb *req, struct io_kiocb **nxt) { - struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); loff_t end = req->sync.off + req->sync.len; - struct io_kiocb *nxt = NULL; int ret; - if (io_req_cancelled(req)) - return; - ret = vfs_fsync_range(req->file, req->sync.off, end > 0 ? 
end : LLONG_MAX, req->sync.flags & IORING_FSYNC_DATASYNC); if (ret < 0) req_set_fail_links(req); io_cqring_add_event(req, ret); - io_put_req_find_next(req, &nxt); + io_put_req_find_next(req, nxt); +} + +static void io_fsync_finish(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct io_kiocb *nxt = NULL; + + if (io_req_cancelled(req)) + return; + __io_fsync(req, &nxt); if (nxt) io_wq_assign_next(workptr, nxt); } @@ -2486,26 +2491,18 @@ static void io_fsync_finish(struct io_wq_work **workptr) static int io_fsync(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { - struct io_wq_work *work, *old_work; - /* fsync always requires a blocking context */ if (force_nonblock) { io_put_req(req); req->work.func = io_fsync_finish; return -EAGAIN; } - - work = old_work = &req->work; - io_fsync_finish(&work); - if (work && work != old_work) - *nxt = container_of(work, struct io_kiocb, work); + __io_fsync(req, nxt); return 0; } -static void io_fallocate_finish(struct io_wq_work **workptr) +static void __io_fallocate(struct io_kiocb *req, struct io_kiocb **nxt) { - struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); - struct io_kiocb *nxt = NULL; int ret; if (io_req_cancelled(req)) @@ -2516,7 +2513,15 @@ static void io_fallocate_finish(struct io_wq_work **workptr) if (ret < 0) req_set_fail_links(req); io_cqring_add_event(req, ret); - io_put_req_find_next(req, &nxt); + io_put_req_find_next(req, nxt); +} + +static void io_fallocate_finish(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct io_kiocb *nxt = NULL; + + __io_fallocate(req, &nxt); if (nxt) io_wq_assign_next(workptr, nxt); } @@ -2536,8 +2541,6 @@ static int io_fallocate_prep(struct io_kiocb *req, static int io_fallocate(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { - struct io_wq_work *work, *old_work; - /* fallocate always requiring blocking context */ if (force_nonblock) { io_put_req(req); @@ -2545,11 +2548,7 @@ static int io_fallocate(struct io_kiocb *req, struct io_kiocb **nxt, return -EAGAIN; } - work = old_work = &req->work; - io_fallocate_finish(&work); - if (work && work != old_work) - *nxt = container_of(work, struct io_kiocb, work); - + __io_fallocate(req, nxt); return 0; } @@ -2953,21 +2952,27 @@ static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static void io_sync_file_range_finish(struct io_wq_work **workptr) +static void __io_sync_file_range(struct io_kiocb *req, struct io_kiocb **nxt) { - struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); - struct io_kiocb *nxt = NULL; int ret; - if (io_req_cancelled(req)) - return; - ret = sync_file_range(req->file, req->sync.off, req->sync.len, req->sync.flags); if (ret < 0) req_set_fail_links(req); io_cqring_add_event(req, ret); - io_put_req_find_next(req, &nxt); + io_put_req_find_next(req, nxt); +} + + +static void io_sync_file_range_finish(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct io_kiocb *nxt = NULL; + + if (io_req_cancelled(req)) + return; + __io_sync_file_range(req, &nxt); if (nxt) io_wq_assign_next(workptr, nxt); } @@ -2975,8 +2980,6 @@ static void io_sync_file_range_finish(struct io_wq_work **workptr) static int io_sync_file_range(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { - struct io_wq_work *work, *old_work; - /* sync_file_range always requires a blocking 
context */ if (force_nonblock) { io_put_req(req); @@ -2984,10 +2987,7 @@ static int io_sync_file_range(struct io_kiocb *req, struct io_kiocb **nxt, return -EAGAIN; } - work = old_work = &req->work; - io_sync_file_range_finish(&work); - if (work && work != old_work) - *nxt = container_of(work, struct io_kiocb, work); + __io_sync_file_range(req, nxt); return 0; } -- cgit v1.2.3 From deb6dc0544884067b93bbf9a4716be323103b911 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 24 Feb 2020 11:30:17 +0300 Subject: io_uring: don't do full *prep_worker() from io-wq io_prep_async_worker() called io_wq_assign_next() do many useless checks: io_req_work_grab_env() was already called during prep, and @do_hashed is not ever used. Add io_prep_next_work() -- simplified version, that can be called io-wq. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 83ae190a3d31..6f085215be13 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -950,6 +950,17 @@ static inline void io_req_work_drop_env(struct io_kiocb *req) } } +static inline void io_prep_next_work(struct io_kiocb *req, + struct io_kiocb **link) +{ + const struct io_op_def *def = &io_op_defs[req->opcode]; + + if (!(req->flags & REQ_F_ISREG) && def->unbound_nonreg_file) + req->work.flags |= IO_WQ_WORK_UNBOUND; + + *link = io_prep_linked_timeout(req); +} + static inline bool io_prep_async_work(struct io_kiocb *req, struct io_kiocb **link) { @@ -2453,7 +2464,7 @@ static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt) { struct io_kiocb *link; - io_prep_async_work(nxt, &link); + io_prep_next_work(nxt, &link); *workptr = &nxt->work; if (link) { nxt->work.flags |= IO_WQ_WORK_CB; -- cgit v1.2.3 From bcaec089c5b64953f96a59089598643911765a43 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 24 Feb 2020 11:30:18 +0300 Subject: io_uring: remove req->in_async req->in_async is not really needed, it only prevents propagation of @nxt for fast not-blocked submissions. Remove it. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6f085215be13..5f2c0afefae1 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -551,7 +551,6 @@ struct io_kiocb { * llist_node is only used for poll deferred completions */ struct llist_node llist_node; - bool in_async; bool needs_fixed_file; u8 opcode; @@ -1973,14 +1972,13 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) } } -static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt, - bool in_async) +static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); if (req->flags & REQ_F_CUR_POS) req->file->f_pos = kiocb->ki_pos; - if (in_async && ret >= 0 && kiocb->ki_complete == io_complete_rw) + if (ret >= 0 && kiocb->ki_complete == io_complete_rw) *nxt = __io_complete_rw(kiocb, ret); else io_rw_done(kiocb, ret); @@ -2271,7 +2269,7 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, /* Catch -EAGAIN return for forced non-blocking submission */ if (!force_nonblock || ret2 != -EAGAIN) { - kiocb_done(kiocb, ret2, nxt, req->in_async); + kiocb_done(kiocb, ret2, nxt); } else { copy_iov: ret = io_setup_async_rw(req, io_size, iovec, @@ -2385,7 +2383,7 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) ret2 = -EAGAIN; if (!force_nonblock || ret2 != -EAGAIN) { - kiocb_done(kiocb, ret2, nxt, req->in_async); + kiocb_done(kiocb, ret2, nxt); } else { copy_iov: ret = io_setup_async_rw(req, io_size, iovec, @@ -4535,7 +4533,6 @@ static void io_wq_submit_work(struct io_wq_work **workptr) } if (!ret) { - req->in_async = true; do { ret = io_issue_sqe(req, NULL, &nxt, false); /* @@ -5077,7 +5074,6 @@ fail_req: *mm = ctx->sqo_mm; } - req->in_async = async; req->needs_fixed_file = async; trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data, true, async); -- cgit v1.2.3 From 444ebb5768c5c43aadfc60111fecd6c4f946e77b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 24 Feb 2020 11:32:43 +0300 Subject: splice: make do_splice public Make do_splice(), so other kernel parts can reuse it Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/splice.c | 6 +++--- include/linux/splice.h | 3 +++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/splice.c b/fs/splice.c index d671936d0aad..4735defc46ee 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1109,9 +1109,9 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, /* * Determine where to splice to/from. 
*/ -static long do_splice(struct file *in, loff_t __user *off_in, - struct file *out, loff_t __user *off_out, - size_t len, unsigned int flags) +long do_splice(struct file *in, loff_t __user *off_in, + struct file *out, loff_t __user *off_out, + size_t len, unsigned int flags) { struct pipe_inode_info *ipipe; struct pipe_inode_info *opipe; diff --git a/include/linux/splice.h b/include/linux/splice.h index 74b4911ac16d..ebbbfea48aa0 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h @@ -78,6 +78,9 @@ extern ssize_t add_to_pipe(struct pipe_inode_info *, struct pipe_buffer *); extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, splice_direct_actor *); +extern long do_splice(struct file *in, loff_t __user *off_in, + struct file *out, loff_t __user *off_out, + size_t len, unsigned int flags); /* * for dynamic pipe sizing -- cgit v1.2.3 From 8da11c19940ddbc22fc835bce3f361f4d2417fb0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 24 Feb 2020 11:32:44 +0300 Subject: io_uring: add interface for getting files Preparation without functional changes. Adds io_get_file(), that allows to grab files not only into req->file. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 72 +++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 43 insertions(+), 29 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 5f2c0afefae1..1a3de7337274 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1253,6 +1253,15 @@ fallback: return NULL; } +static inline void io_put_file(struct io_kiocb *req, struct file *file, + bool fixed) +{ + if (fixed) + percpu_ref_put(&req->ctx->file_data->refs); + else + fput(file); +} + static void __io_req_do_free(struct io_kiocb *req) { if (likely(!io_is_fallback_req(req))) @@ -1263,18 +1272,12 @@ static void __io_req_do_free(struct io_kiocb *req) static void __io_req_aux_free(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; - if (req->flags & REQ_F_NEED_CLEANUP) io_cleanup_req(req); kfree(req->io); - if (req->file) { - if (req->flags & REQ_F_FIXED_FILE) - percpu_ref_put(&ctx->file_data->refs); - else - fput(req->file); - } + if (req->file) + io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); io_req_work_drop_env(req); } @@ -1848,7 +1851,7 @@ static void io_file_put(struct io_submit_state *state) * assuming most submissions are for one file, or at least that each file * has more than one submission. 
*/ -static struct file *io_file_get(struct io_submit_state *state, int fd) +static struct file *__io_file_get(struct io_submit_state *state, int fd) { if (!state) return fget(fd); @@ -4578,41 +4581,52 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, return table->files[index & IORING_FILE_TABLE_MASK];; } -static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, - const struct io_uring_sqe *sqe) +static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, + int fd, struct file **out_file, bool fixed) { struct io_ring_ctx *ctx = req->ctx; - unsigned flags; - int fd; - - flags = READ_ONCE(sqe->flags); - fd = READ_ONCE(sqe->fd); - - if (!io_req_needs_file(req, fd)) - return 0; + struct file *file; - if (flags & IOSQE_FIXED_FILE) { + if (fixed) { if (unlikely(!ctx->file_data || (unsigned) fd >= ctx->nr_user_files)) return -EBADF; fd = array_index_nospec(fd, ctx->nr_user_files); - req->file = io_file_from_index(ctx, fd); - if (!req->file) + file = io_file_from_index(ctx, fd); + if (!file) return -EBADF; - req->flags |= REQ_F_FIXED_FILE; percpu_ref_get(&ctx->file_data->refs); } else { - if (req->needs_fixed_file) - return -EBADF; trace_io_uring_file_get(ctx, fd); - req->file = io_file_get(state, fd); - if (unlikely(!req->file)) + file = __io_file_get(state, fd); + if (unlikely(!file)) return -EBADF; } + *out_file = file; return 0; } +static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + unsigned flags; + int fd; + bool fixed; + + flags = READ_ONCE(sqe->flags); + fd = READ_ONCE(sqe->fd); + + if (!io_req_needs_file(req, fd)) + return 0; + + fixed = (flags & IOSQE_FIXED_FILE); + if (unlikely(!fixed && req->needs_fixed_file)) + return -EBADF; + + return io_file_get(state, req, fd, &req->file, fixed); +} + static int io_grab_files(struct io_kiocb *req) { int ret = -EBADF; @@ -4857,8 +4871,8 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } /* same numerical values with corresponding REQ_F_*, safe to copy */ - req->flags |= sqe_flags & (IOSQE_IO_DRAIN|IOSQE_IO_HARDLINK| - IOSQE_ASYNC); + req->flags |= sqe_flags & (IOSQE_IO_DRAIN | IOSQE_IO_HARDLINK | + IOSQE_ASYNC | IOSQE_FIXED_FILE); ret = io_req_set_file(state, req, sqe); if (unlikely(ret)) { -- cgit v1.2.3 From 7d67af2c013402537385dae343a2d0f6a4cb3bfd Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 24 Feb 2020 11:32:45 +0300 Subject: io_uring: add splice(2) support Add support for splice(2). - output file is specified as sqe->fd, so it's handled by generic code - hash_reg_file handled by generic code as well - len is 32bit, but should be fine - the fd_in is registered file, when SPLICE_F_FD_IN_FIXED is set, which is a splice flag (i.e. sqe->splice_flags). 
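For illustration, a minimal user-space sketch of how the mapping above could be exercised, splicing from a pipe read end into a regular file. The ring setup relies on liburing (io_uring_get_sqe() and io_uring_submit() are assumed available), the sqe members are the ones this patch adds to the uapi header, and queue_splice() is only an illustrative name:

/* Sketch only: splice nbytes from a pipe read end into a regular file.
 * Field mapping follows this patch: splice_fd_in = input fd,
 * splice_off_in = input offset (-1 for "no offset", e.g. a pipe),
 * fd/off = output file and offset, splice_flags = splice(2) flags.
 */
#include <liburing.h>
#include <errno.h>
#include <string.h>

static int queue_splice(struct io_uring *ring, int fd_in, int fd_out,
                        __u64 off_out, unsigned int nbytes)
{
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

        if (!sqe)
                return -EBUSY;
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_SPLICE;
        sqe->splice_fd_in = fd_in;              /* pipe read end */
        sqe->splice_off_in = (__u64) -1;        /* pipes take no offset */
        sqe->fd = fd_out;                       /* output file, like any other sqe->fd */
        sqe->off = off_out;                     /* offset into the output file */
        sqe->len = nbytes;
        sqe->splice_flags = 0;                  /* or SPLICE_F_FD_IN_FIXED for a registered fd_in */
        return io_uring_submit(ring);
}

On completion, cqe->res carries the number of bytes spliced or a negative error, mirroring splice(2); with SPLICE_F_FD_IN_FIXED set, splice_fd_in is interpreted as a registered file index rather than a regular fd.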
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 109 ++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/io_uring.h | 14 +++++- 2 files changed, 122 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 1a3de7337274..1ef20a2af10b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -76,6 +76,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -428,6 +429,15 @@ struct io_epoll { struct epoll_event event; }; +struct io_splice { + struct file *file_out; + struct file *file_in; + loff_t off_out; + loff_t off_in; + u64 len; + unsigned int flags; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -544,6 +554,7 @@ struct io_kiocb { struct io_fadvise fadvise; struct io_madvise madvise; struct io_epoll epoll; + struct io_splice splice; }; struct io_async_ctx *io; @@ -744,6 +755,11 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .file_table = 1, }, + [IORING_OP_SPLICE] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + } }; static void io_wq_submit_work(struct io_wq_work **workptr); @@ -758,6 +774,10 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, static int io_grab_files(struct io_kiocb *req); static void io_ring_file_ref_flush(struct fixed_file_data *data); static void io_cleanup_req(struct io_kiocb *req); +static int io_file_get(struct io_submit_state *state, + struct io_kiocb *req, + int fd, struct file **out_file, + bool fixed); static struct kmem_cache *req_cachep; @@ -2404,6 +2424,77 @@ out_free: return ret; } +static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_splice* sp = &req->splice; + unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL; + int ret; + + if (req->flags & REQ_F_NEED_CLEANUP) + return 0; + + sp->file_in = NULL; + sp->off_in = READ_ONCE(sqe->splice_off_in); + sp->off_out = READ_ONCE(sqe->off); + sp->len = READ_ONCE(sqe->len); + sp->flags = READ_ONCE(sqe->splice_flags); + + if (unlikely(sp->flags & ~valid_flags)) + return -EINVAL; + + ret = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in), &sp->file_in, + (sp->flags & SPLICE_F_FD_IN_FIXED)); + if (ret) + return ret; + req->flags |= REQ_F_NEED_CLEANUP; + + if (!S_ISREG(file_inode(sp->file_in)->i_mode)) + req->work.flags |= IO_WQ_WORK_UNBOUND; + + return 0; +} + +static bool io_splice_punt(struct file *file) +{ + if (get_pipe_info(file)) + return false; + if (!io_file_supports_async(file)) + return true; + return !(file->f_mode & O_NONBLOCK); +} + +static int io_splice(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct io_splice *sp = &req->splice; + struct file *in = sp->file_in; + struct file *out = sp->file_out; + unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; + loff_t *poff_in, *poff_out; + long ret; + + if (force_nonblock) { + if (io_splice_punt(in) || io_splice_punt(out)) + return -EAGAIN; + flags |= SPLICE_F_NONBLOCK; + } + + poff_in = (sp->off_in == -1) ? NULL : &sp->off_in; + poff_out = (sp->off_out == -1) ? NULL : &sp->off_out; + ret = do_splice(in, poff_in, out, poff_out, sp->len, flags); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + + io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED)); + req->flags &= ~REQ_F_NEED_CLEANUP; + + io_cqring_add_event(req, ret); + if (ret != sp->len) + req_set_fail_links(req); + io_put_req_find_next(req, nxt); + return 0; +} + /* * IORING_OP_NOP just posts a completion event, nothing else. 
*/ @@ -4230,6 +4321,9 @@ static int io_req_defer_prep(struct io_kiocb *req, case IORING_OP_EPOLL_CTL: ret = io_epoll_ctl_prep(req, sqe); break; + case IORING_OP_SPLICE: + ret = io_splice_prep(req, sqe); + break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); @@ -4292,6 +4386,10 @@ static void io_cleanup_req(struct io_kiocb *req) case IORING_OP_STATX: putname(req->open.filename); break; + case IORING_OP_SPLICE: + io_put_file(req, req->splice.file_in, + (req->splice.flags & SPLICE_F_FD_IN_FIXED)); + break; } req->flags &= ~REQ_F_NEED_CLEANUP; @@ -4495,6 +4593,14 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } ret = io_epoll_ctl(req, nxt, force_nonblock); break; + case IORING_OP_SPLICE: + if (sqe) { + ret = io_splice_prep(req, sqe); + if (ret < 0) + break; + } + ret = io_splice(req, nxt, force_nonblock); + break; default: ret = -EINVAL; break; @@ -7230,6 +7336,7 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(8, __u64, off); BUILD_BUG_SQE_ELEM(8, __u64, addr2); BUILD_BUG_SQE_ELEM(16, __u64, addr); + BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in); BUILD_BUG_SQE_ELEM(24, __u32, len); BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags); BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags); @@ -7244,9 +7351,11 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(28, __u32, open_flags); BUILD_BUG_SQE_ELEM(28, __u32, statx_flags); BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice); + BUILD_BUG_SQE_ELEM(28, __u32, splice_flags); BUILD_BUG_SQE_ELEM(32, __u64, user_data); BUILD_BUG_SQE_ELEM(40, __u16, buf_index); BUILD_BUG_SQE_ELEM(42, __u16, personality); + BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in); BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 3f7961c1c243..08891cc1c1e7 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -23,7 +23,10 @@ struct io_uring_sqe { __u64 off; /* offset into file */ __u64 addr2; }; - __u64 addr; /* pointer to buffer or iovecs */ + union { + __u64 addr; /* pointer to buffer or iovecs */ + __u64 splice_off_in; + }; __u32 len; /* buffer size or number of iovecs */ union { __kernel_rwf_t rw_flags; @@ -37,6 +40,7 @@ struct io_uring_sqe { __u32 open_flags; __u32 statx_flags; __u32 fadvise_advice; + __u32 splice_flags; }; __u64 user_data; /* data to be passed back at completion time */ union { @@ -45,6 +49,7 @@ struct io_uring_sqe { __u16 buf_index; /* personality to use, if used */ __u16 personality; + __s32 splice_fd_in; }; __u64 __pad2[3]; }; @@ -113,6 +118,7 @@ enum { IORING_OP_RECV, IORING_OP_OPENAT2, IORING_OP_EPOLL_CTL, + IORING_OP_SPLICE, /* this goes last, obviously */ IORING_OP_LAST, @@ -128,6 +134,12 @@ enum { */ #define IORING_TIMEOUT_ABS (1U << 0) +/* + * sqe->splice_flags + * extends splice(2) flags + */ +#define SPLICE_F_FD_IN_FIXED (1U << 31) /* the last bit of __u32 */ + /* * IO completion data structure (Completion Queue Entry) */ -- cgit v1.2.3 From b0a20349f212dc725f5ddfd060e426fe6181d9c5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 28 Feb 2020 10:36:35 +0300 Subject: io_uring: clean io_poll_complete Deduplicate call to io_cqring_fill_event(), plain and easy Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 1ef20a2af10b..f4c6661b33bc 100644 --- 
a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3641,10 +3641,7 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error) struct io_ring_ctx *ctx = req->ctx; req->poll.done = true; - if (error) - io_cqring_fill_event(req, error); - else - io_cqring_fill_event(req, mangle_poll(mask)); + io_cqring_fill_event(req, error ? error : mangle_poll(mask)); io_commit_cqring(ctx); } -- cgit v1.2.3 From 02d27d895323c4baa3234e4bed015eb3a196e1dd Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 28 Feb 2020 10:36:36 +0300 Subject: io_uring: extract kmsg copy helper io_recvmsg() and io_sendmsg() duplicate nonblock -EAGAIN finilising part, so add helper for that. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 43 +++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index f4c6661b33bc..2a8d88c9bcab 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3094,6 +3094,21 @@ static int io_sync_file_range(struct io_kiocb *req, struct io_kiocb **nxt, return 0; } +static int io_setup_async_msg(struct io_kiocb *req, + struct io_async_msghdr *kmsg) +{ + if (req->io) + return -EAGAIN; + if (io_alloc_async_ctx(req)) { + if (kmsg->iov != kmsg->fast_iov) + kfree(kmsg->iov); + return -ENOMEM; + } + req->flags |= REQ_F_NEED_CLEANUP; + memcpy(&req->io->msg, kmsg, sizeof(*kmsg)); + return -EAGAIN; +} + static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_NET) @@ -3170,18 +3185,8 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, flags |= MSG_DONTWAIT; ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); - if (force_nonblock && ret == -EAGAIN) { - if (req->io) - return -EAGAIN; - if (io_alloc_async_ctx(req)) { - if (kmsg->iov != kmsg->fast_iov) - kfree(kmsg->iov); - return -ENOMEM; - } - req->flags |= REQ_F_NEED_CLEANUP; - memcpy(&req->io->msg, &io.msg, sizeof(io.msg)); - return -EAGAIN; - } + if (force_nonblock && ret == -EAGAIN) + return io_setup_async_msg(req, kmsg); if (ret == -ERESTARTSYS) ret = -EINTR; } @@ -3329,18 +3334,8 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg, kmsg->uaddr, flags); - if (force_nonblock && ret == -EAGAIN) { - if (req->io) - return -EAGAIN; - if (io_alloc_async_ctx(req)) { - if (kmsg->iov != kmsg->fast_iov) - kfree(kmsg->iov); - return -ENOMEM; - } - memcpy(&req->io->msg, &io.msg, sizeof(io.msg)); - req->flags |= REQ_F_NEED_CLEANUP; - return -EAGAIN; - } + if (force_nonblock && ret == -EAGAIN) + return io_setup_async_msg(req, kmsg); if (ret == -ERESTARTSYS) ret = -EINTR; } -- cgit v1.2.3 From e85530ddda4f08d4f9ed6506d4a1f42e086e3b21 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 28 Feb 2020 10:36:37 +0300 Subject: io-wq: remove unused IO_WQ_WORK_HAS_MM IO_WQ_WORK_HAS_MM is set but never used, remove it. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io-wq.c | 2 -- fs/io-wq.h | 1 - 2 files changed, 3 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 5cef075c0b37..39ed8751ea31 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -499,8 +499,6 @@ next: */ if (test_bit(IO_WQ_BIT_CANCEL, &wq->state)) work->flags |= IO_WQ_WORK_CANCEL; - if (worker->mm) - work->flags |= IO_WQ_WORK_HAS_MM; if (wq->get_work) { put_work = work; diff --git a/fs/io-wq.h b/fs/io-wq.h index e5e15f2c93ec..d500d88ab84e 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -5,7 +5,6 @@ struct io_wq; enum { IO_WQ_WORK_CANCEL = 1, - IO_WQ_WORK_HAS_MM = 2, IO_WQ_WORK_HASHED = 4, IO_WQ_WORK_UNBOUND = 32, IO_WQ_WORK_CB = 128, -- cgit v1.2.3 From 5eae8619907a1389dbd1b4a1049caf52782c0916 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 28 Feb 2020 10:36:38 +0300 Subject: io_uring: remove IO_WQ_WORK_CB IO_WQ_WORK_CB is used only for linked timeouts, which will be armed before the work setup (i.e. mm, override creds, etc). The setup shouldn't take long, so it's ok to arm it a bit later and get rid of IO_WQ_WORK_CB. Make io-wq call work->func() only once, callbacks will handle the rest. i.e. the linked timeout handler will do the actual issue. And as a bonus, it removes an extra indirect call. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io-wq.c | 3 --- fs/io-wq.h | 1 - fs/io_uring.c | 3 +-- 3 files changed, 1 insertion(+), 6 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 39ed8751ea31..a1a42ead3b5a 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -479,9 +479,6 @@ next: worker->cur_work = work; spin_unlock_irq(&worker->lock); - if (work->flags & IO_WQ_WORK_CB) - work->func(&work); - if (work->files && current->files != work->files) { task_lock(current); current->files = work->files; diff --git a/fs/io-wq.h b/fs/io-wq.h index d500d88ab84e..a0978d6958f0 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -7,7 +7,6 @@ enum { IO_WQ_WORK_CANCEL = 1, IO_WQ_WORK_HASHED = 4, IO_WQ_WORK_UNBOUND = 32, - IO_WQ_WORK_CB = 128, IO_WQ_WORK_NO_CANCEL = 256, IO_WQ_WORK_CONCURRENT = 512, diff --git a/fs/io_uring.c b/fs/io_uring.c index 2a8d88c9bcab..f999503854b7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2549,7 +2549,7 @@ static void io_link_work_cb(struct io_wq_work **workptr) struct io_kiocb *link = work->data; io_queue_linked_timeout(link); - work->func = io_wq_submit_work; + io_wq_submit_work(workptr); } static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt) @@ -2559,7 +2559,6 @@ static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt) io_prep_next_work(nxt, &link); *workptr = &nxt->work; if (link) { - nxt->work.flags |= IO_WQ_WORK_CB; nxt->work.func = io_link_work_cb; nxt->work.data = link; } -- cgit v1.2.3 From 3684f24653534c71c7dc9f44d7281a838f4e4979 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 28 Feb 2020 10:36:39 +0300 Subject: io-wq: use BIT for ulong hash @hash_map is unsigned long, but BIT_ULL() is used for manipulations. BIT() is a better match as it returns exactly unsigned long value. 
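For context, the two macros differ only in the type of the shifted constant; a paraphrased sketch of the bits.h definitions next to a hypothetical helper shows why BIT() is the natural match for an unsigned long bitmap such as @hash_map:

/* Paraphrase of the include/linux/bits.h definitions (illustrative only). */
#define BIT(nr)         (1UL << (nr))   /* unsigned long: same width as hash_map */
#define BIT_ULL(nr)     (1ULL << (nr))  /* unsigned long long: 64-bit even on 32-bit */

static void mark_hashed(unsigned long *hash_map, unsigned int hash)
{
        /* Identical result for hash < BITS_PER_LONG, but BIT() stays in the
         * bitmap's own type instead of going through a 64-bit intermediate
         * that is truncated back on 32-bit builds.
         */
        *hash_map |= BIT(hash);
}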
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io-wq.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index a1a42ead3b5a..042c7e2057ef 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -393,8 +393,8 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash) /* hashed, can run if not already running */ *hash = work->flags >> IO_WQ_HASH_SHIFT; - if (!(wqe->hash_map & BIT_ULL(*hash))) { - wqe->hash_map |= BIT_ULL(*hash); + if (!(wqe->hash_map & BIT(*hash))) { + wqe->hash_map |= BIT(*hash); wq_node_del(&wqe->work_list, node, prev); return work; } @@ -512,7 +512,7 @@ next: spin_lock_irq(&wqe->lock); if (hash != -1U) { - wqe->hash_map &= ~BIT_ULL(hash); + wqe->hash_map &= ~BIT(hash); wqe->flags &= ~IO_WQE_FLAG_STALLED; } if (work && work != old_work) { -- cgit v1.2.3 From 6fb614920b38bbf3c1c7fcd944c6d9b5d746103d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 18 Feb 2020 16:50:18 +0100 Subject: task_work_run: don't take ->pi_lock unconditionally As Peter pointed out, task_work() can avoid ->pi_lock and cmpxchg() if task->task_works == NULL && !PF_EXITING. And in fact the only reason why task_work_run() needs ->pi_lock is the possible race with task_work_cancel(), we can optimize this code and make the locking more clear. Signed-off-by: Oleg Nesterov Signed-off-by: Jens Axboe --- kernel/task_work.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/kernel/task_work.c b/kernel/task_work.c index 0fef395662a6..825f28259a19 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -97,16 +97,26 @@ void task_work_run(void) * work->func() can do task_work_add(), do not set * work_exited unless the list is empty. */ - raw_spin_lock_irq(&task->pi_lock); do { + head = NULL; work = READ_ONCE(task->task_works); - head = !work && (task->flags & PF_EXITING) ? - &work_exited : NULL; + if (!work) { + if (task->flags & PF_EXITING) + head = &work_exited; + else + break; + } } while (cmpxchg(&task->task_works, work, head) != work); - raw_spin_unlock_irq(&task->pi_lock); if (!work) break; + /* + * Synchronize with task_work_cancel(). It can not remove + * the first entry == work, cmpxchg(task_works) must fail. + * But it can remove another entry from the ->next list. + */ + raw_spin_lock_irq(&task->pi_lock); + raw_spin_unlock_irq(&task->pi_lock); do { next = work->next; -- cgit v1.2.3 From c2f2eb7d2c1cdc37fa9633bae96f381d33ee7a14 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Feb 2020 09:07:05 -0700 Subject: io_uring: store io_kiocb in wait->private Store the io_kiocb in the private field instead of the poll entry, this is in preparation for allowing multiple waitqueues. No functional changes in this patch. 
Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index f999503854b7..7a97a6c1c09e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3730,8 +3730,8 @@ static void io_poll_trigger_evfd(struct io_wq_work **workptr) static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { - struct io_poll_iocb *poll = wait->private; - struct io_kiocb *req = container_of(poll, struct io_kiocb, poll); + struct io_kiocb *req = wait->private; + struct io_poll_iocb *poll = &req->poll; struct io_ring_ctx *ctx = req->ctx; __poll_t mask = key_to_poll(key); @@ -3854,7 +3854,7 @@ static int io_poll_add(struct io_kiocb *req, struct io_kiocb **nxt) /* initialized the list so that we can do list_empty checks */ INIT_LIST_HEAD(&poll->wait.entry); init_waitqueue_func_entry(&poll->wait, io_poll_wake); - poll->wait.private = poll; + poll->wait.private = req; INIT_LIST_HEAD(&req->list); -- cgit v1.2.3 From b41e98524e424d104aa7851d54fd65820759875a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 17 Feb 2020 09:52:41 -0700 Subject: io_uring: add per-task callback handler For poll requests, it's not uncommon to link a read (or write) after the poll to execute immediately after the file is marked as ready. Since the poll completion is called inside the waitqueue wake up handler, we have to punt that linked request to async context. This slows down the processing, and actually means it's faster to not use a link for this use case. We also run into problems if the completion_lock is contended, as we're doing a different lock ordering than the issue side is. Hence we have to do trylock for completion, and if that fails, go async. Poll removal needs to go async as well, for the same reason. eventfd notification needs special case as well, to avoid stack blowing recursion or deadlocks. These are all deficiencies that were inherited from the aio poll implementation, but I think we can do better. When a poll completes, simply queue it up in the task poll list. When the task completes the list, we can run dependent links inline as well. This means we never have to go async, and we can remove a bunch of code associated with that, and optimizations to try and make that run faster. The diffstat speaks for itself. Signed-off-by: Jens Axboe --- fs/io_uring.c | 218 ++++++++++++++++++++-------------------------------------- 1 file changed, 76 insertions(+), 142 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 7a97a6c1c09e..a16b5632ce6f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -77,6 +77,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -291,7 +292,6 @@ struct io_ring_ctx { struct { spinlock_t completion_lock; - struct llist_head poll_llist; /* * ->poll_list is protected by the ctx->uring_lock for @@ -558,10 +558,6 @@ struct io_kiocb { }; struct io_async_ctx *io; - /* - * llist_node is only used for poll deferred completions - */ - struct llist_node llist_node; bool needs_fixed_file; u8 opcode; @@ -579,7 +575,17 @@ struct io_kiocb { struct list_head inflight_entry; - struct io_wq_work work; + union { + /* + * Only commands that never go async can use the below fields, + * obviously. Right now only IORING_OP_POLL_ADD uses them. 
+ */ + struct { + struct task_struct *task; + struct callback_head task_work; + }; + struct io_wq_work work; + }; }; #define IO_PLUG_THRESHOLD 2 @@ -774,10 +780,10 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, static int io_grab_files(struct io_kiocb *req); static void io_ring_file_ref_flush(struct fixed_file_data *data); static void io_cleanup_req(struct io_kiocb *req); -static int io_file_get(struct io_submit_state *state, - struct io_kiocb *req, - int fd, struct file **out_file, - bool fixed); +static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, + int fd, struct file **out_file, bool fixed); +static void __io_queue_sqe(struct io_kiocb *req, + const struct io_uring_sqe *sqe); static struct kmem_cache *req_cachep; @@ -848,7 +854,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->wait); spin_lock_init(&ctx->completion_lock); - init_llist_head(&ctx->poll_llist); INIT_LIST_HEAD(&ctx->poll_list); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); @@ -1081,24 +1086,19 @@ static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) return false; if (!ctx->eventfd_async) return true; - return io_wq_current_is_worker() || in_interrupt(); + return io_wq_current_is_worker(); } -static void __io_cqring_ev_posted(struct io_ring_ctx *ctx, bool trigger_ev) +static void io_cqring_ev_posted(struct io_ring_ctx *ctx) { if (waitqueue_active(&ctx->wait)) wake_up(&ctx->wait); if (waitqueue_active(&ctx->sqo_wait)) wake_up(&ctx->sqo_wait); - if (trigger_ev) + if (io_should_trigger_evfd(ctx)) eventfd_signal(ctx->cq_ev_fd, 1); } -static void io_cqring_ev_posted(struct io_ring_ctx *ctx) -{ - __io_cqring_ev_posted(ctx, io_should_trigger_evfd(ctx)); -} - /* Returns true if there are no backlogged entries after the flush */ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { @@ -3548,18 +3548,27 @@ out: #endif } -static void io_poll_remove_one(struct io_kiocb *req) +static bool io_poll_remove_one(struct io_kiocb *req) { struct io_poll_iocb *poll = &req->poll; + bool do_complete = false; spin_lock(&poll->head->lock); WRITE_ONCE(poll->canceled, true); if (!list_empty(&poll->wait.entry)) { list_del_init(&poll->wait.entry); - io_queue_async_work(req); + do_complete = true; } spin_unlock(&poll->head->lock); hash_del(&req->hash_node); + if (do_complete) { + io_cqring_fill_event(req, -ECANCELED); + io_commit_cqring(req->ctx); + req->flags |= REQ_F_COMP_LOCKED; + io_put_req(req); + } + + return do_complete; } static void io_poll_remove_all(struct io_ring_ctx *ctx) @@ -3577,6 +3586,8 @@ static void io_poll_remove_all(struct io_ring_ctx *ctx) io_poll_remove_one(req); } spin_unlock_irq(&ctx->completion_lock); + + io_cqring_ev_posted(ctx); } static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr) @@ -3586,10 +3597,11 @@ static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr) list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)]; hlist_for_each_entry(req, list, hash_node) { - if (sqe_addr == req->user_data) { - io_poll_remove_one(req); + if (sqe_addr != req->user_data) + continue; + if (io_poll_remove_one(req)) return 0; - } + return -EALREADY; } return -ENOENT; @@ -3639,92 +3651,28 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error) io_commit_cqring(ctx); } -static void io_poll_complete_work(struct io_wq_work **workptr) +static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt) 
{ - struct io_wq_work *work = *workptr; - struct io_kiocb *req = container_of(work, struct io_kiocb, work); - struct io_poll_iocb *poll = &req->poll; - struct poll_table_struct pt = { ._key = poll->events }; struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *nxt = NULL; - __poll_t mask = 0; - int ret = 0; - - if (work->flags & IO_WQ_WORK_CANCEL) { - WRITE_ONCE(poll->canceled, true); - ret = -ECANCELED; - } else if (READ_ONCE(poll->canceled)) { - ret = -ECANCELED; - } - - if (ret != -ECANCELED) - mask = vfs_poll(poll->file, &pt) & poll->events; - /* - * Note that ->ki_cancel callers also delete iocb from active_reqs after - * calling ->ki_cancel. We need the ctx_lock roundtrip here to - * synchronize with them. In the cancellation case the list_del_init - * itself is not actually needed, but harmless so we keep it in to - * avoid further branches in the fast path. - */ spin_lock_irq(&ctx->completion_lock); - if (!mask && ret != -ECANCELED) { - add_wait_queue(poll->head, &poll->wait); - spin_unlock_irq(&ctx->completion_lock); - return; - } hash_del(&req->hash_node); - io_poll_complete(req, mask, ret); - spin_unlock_irq(&ctx->completion_lock); - - io_cqring_ev_posted(ctx); - - if (ret < 0) - req_set_fail_links(req); - io_put_req_find_next(req, &nxt); - if (nxt) - io_wq_assign_next(workptr, nxt); -} - -static void __io_poll_flush(struct io_ring_ctx *ctx, struct llist_node *nodes) -{ - struct io_kiocb *req, *tmp; - struct req_batch rb; - - rb.to_free = rb.need_iter = 0; - spin_lock_irq(&ctx->completion_lock); - llist_for_each_entry_safe(req, tmp, nodes, llist_node) { - hash_del(&req->hash_node); - io_poll_complete(req, req->result, 0); - - if (refcount_dec_and_test(&req->refs) && - !io_req_multi_free(&rb, req)) { - req->flags |= REQ_F_COMP_LOCKED; - io_free_req(req); - } - } + io_poll_complete(req, req->result, 0); + req->flags |= REQ_F_COMP_LOCKED; + io_put_req_find_next(req, nxt); spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); - io_free_req_many(ctx, &rb); -} - -static void io_poll_flush(struct io_wq_work **workptr) -{ - struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); - struct llist_node *nodes; - - nodes = llist_del_all(&req->ctx->poll_llist); - if (nodes) - __io_poll_flush(req->ctx, nodes); } -static void io_poll_trigger_evfd(struct io_wq_work **workptr) +static void io_poll_task_func(struct callback_head *cb) { - struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); + struct io_kiocb *nxt = NULL; - eventfd_signal(req->ctx->cq_ev_fd, 1); - io_put_req(req); + io_poll_task_handler(req, &nxt); + if (nxt) + __io_queue_sqe(nxt, NULL); } static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, @@ -3732,8 +3680,8 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, { struct io_kiocb *req = wait->private; struct io_poll_iocb *poll = &req->poll; - struct io_ring_ctx *ctx = req->ctx; __poll_t mask = key_to_poll(key); + struct task_struct *tsk; /* for instances that support it check for an event match first: */ if (mask && !(mask & poll->events)) @@ -3741,46 +3689,11 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, list_del_init(&poll->wait.entry); - /* - * Run completion inline if we can. We're using trylock here because - * we are violating the completion_lock -> poll wq lock ordering. 
- * If we have a link timeout we're going to need the completion_lock - * for finalizing the request, mark us as having grabbed that already. - */ - if (mask) { - unsigned long flags; - - if (llist_empty(&ctx->poll_llist) && - spin_trylock_irqsave(&ctx->completion_lock, flags)) { - bool trigger_ev; - - hash_del(&req->hash_node); - io_poll_complete(req, mask, 0); - - trigger_ev = io_should_trigger_evfd(ctx); - if (trigger_ev && eventfd_signal_count()) { - trigger_ev = false; - req->work.func = io_poll_trigger_evfd; - } else { - req->flags |= REQ_F_COMP_LOCKED; - io_put_req(req); - req = NULL; - } - spin_unlock_irqrestore(&ctx->completion_lock, flags); - __io_cqring_ev_posted(ctx, trigger_ev); - } else { - req->result = mask; - req->llist_node.next = NULL; - /* if the list wasn't empty, we're done */ - if (!llist_add(&req->llist_node, &ctx->poll_llist)) - req = NULL; - else - req->work.func = io_poll_flush; - } - } - if (req) - io_queue_async_work(req); - + tsk = req->task; + req->result = mask; + init_task_work(&req->task_work, io_poll_task_func); + task_work_add(tsk, &req->task_work, true); + wake_up_process(tsk); return 1; } @@ -3828,6 +3741,9 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe events = READ_ONCE(sqe->poll_events); poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; + + /* task will wait for requests on exit, don't need a ref */ + req->task = current; return 0; } @@ -3839,7 +3755,6 @@ static int io_poll_add(struct io_kiocb *req, struct io_kiocb **nxt) bool cancel = false; __poll_t mask; - INIT_IO_WORK(&req->work, io_poll_complete_work); INIT_HLIST_NODE(&req->hash_node); poll->head = NULL; @@ -5268,6 +5183,8 @@ static int io_sq_thread(void *data) if (!list_empty(&ctx->poll_list) || (!time_after(jiffies, timeout) && ret != -EBUSY && !percpu_ref_is_dying(&ctx->refs))) { + if (current->task_works) + task_work_run(); cond_resched(); continue; } @@ -5299,6 +5216,10 @@ static int io_sq_thread(void *data) finish_wait(&ctx->sqo_wait, &wait); break; } + if (current->task_works) { + task_work_run(); + continue; + } if (signal_pending(current)) flush_signals(current); schedule(); @@ -5318,6 +5239,9 @@ static int io_sq_thread(void *data) timeout = jiffies + ctx->sq_thread_idle; } + if (current->task_works) + task_work_run(); + set_fs(old_fs); if (cur_mm) { unuse_mm(cur_mm); @@ -5382,8 +5306,13 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, struct io_rings *rings = ctx->rings; int ret = 0; - if (io_cqring_events(ctx, false) >= min_events) - return 0; + do { + if (io_cqring_events(ctx, false) >= min_events) + return 0; + if (!current->task_works) + break; + task_work_run(); + } while (1); if (sig) { #ifdef CONFIG_COMPAT @@ -5403,6 +5332,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, do { prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, TASK_INTERRUPTIBLE); + if (current->task_works) + task_work_run(); if (io_should_wake(&iowq, false)) break; schedule(); @@ -6711,6 +6642,9 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, int submitted = 0; struct fd f; + if (current->task_works) + task_work_run(); + if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP)) return -EINVAL; -- cgit v1.2.3 From 8a72758c51f8a5501a0e01ea95069630edb9ca07 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 20 Feb 2020 09:59:44 -0700 Subject: io_uring: mark requests that we can do poll async in io_op_defs Add a pollin/pollout field to the request table, and have commands that we can safely poll for 
properly marked. Signed-off-by: Jens Axboe --- fs/io_uring.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index a16b5632ce6f..0d973de75127 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -629,6 +629,9 @@ struct io_op_def { unsigned file_table : 1; /* needs ->fs */ unsigned needs_fs : 1; + /* set if opcode supports polled "wait" */ + unsigned pollin : 1; + unsigned pollout : 1; }; static const struct io_op_def io_op_defs[] = { @@ -638,6 +641,7 @@ static const struct io_op_def io_op_defs[] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, + .pollin = 1, }, [IORING_OP_WRITEV] = { .async_ctx = 1, @@ -645,6 +649,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, + .pollout = 1, }, [IORING_OP_FSYNC] = { .needs_file = 1, @@ -652,11 +657,13 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_READ_FIXED] = { .needs_file = 1, .unbound_nonreg_file = 1, + .pollin = 1, }, [IORING_OP_WRITE_FIXED] = { .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, + .pollout = 1, }, [IORING_OP_POLL_ADD] = { .needs_file = 1, @@ -672,6 +679,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .needs_fs = 1, + .pollout = 1, }, [IORING_OP_RECVMSG] = { .async_ctx = 1, @@ -679,6 +687,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .needs_fs = 1, + .pollin = 1, }, [IORING_OP_TIMEOUT] = { .async_ctx = 1, @@ -690,6 +699,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .file_table = 1, + .pollin = 1, }, [IORING_OP_ASYNC_CANCEL] = {}, [IORING_OP_LINK_TIMEOUT] = { @@ -701,6 +711,7 @@ static const struct io_op_def io_op_defs[] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, + .pollout = 1, }, [IORING_OP_FALLOCATE] = { .needs_file = 1, @@ -729,11 +740,13 @@ static const struct io_op_def io_op_defs[] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, + .pollin = 1, }, [IORING_OP_WRITE] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, + .pollout = 1, }, [IORING_OP_FADVISE] = { .needs_file = 1, @@ -745,11 +758,13 @@ static const struct io_op_def io_op_defs[] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, + .pollout = 1, }, [IORING_OP_RECV] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, + .pollin = 1, }, [IORING_OP_OPENAT2] = { .needs_file = 1, -- cgit v1.2.3 From d7718a9d25a61442da8ee8aeeff6a0097f0ccfd6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 14 Feb 2020 22:23:12 -0700 Subject: io_uring: use poll driven retry for files that support it Currently io_uring tries any request in a non-blocking manner, if it can, and then retries from a worker thread if we get -EAGAIN. Now that we have a new and fancy poll based retry backend, use that to retry requests if the file supports it. This means that, for example, an IORING_OP_RECVMSG on a socket no longer requires an async thread to complete the IO. If we get -EAGAIN reading from the socket in a non-blocking manner, we arm a poll handler for notification on when the socket becomes readable. When it does, the pending read is executed directly by the task again, through the io_uring task work handlers. Not only is this faster and more efficient, it also means we're not generating potentially tons of async threads that just sit and block, waiting for the IO to complete. 
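To make the gain concrete, here is a hedged sketch of the kind of request that benefits: a receive on a socket that has no data queued yet. Previously the -EAGAIN would punt the request to an io-wq thread that then blocks in recvmsg; with poll driven retry the submitting task is simply woken and re-issues the receive itself from task work. liburing's io_uring_queue_init(), io_uring_prep_recv() and related helpers are assumed to be available:

/* Sketch only: queue a single receive on an idle socket. With poll driven
 * retry, no io-wq worker blocks on the socket; on -EAGAIN the kernel arms
 * its internal poll handler and re-issues the receive once the socket
 * becomes readable.
 */
#include <liburing.h>
#include <errno.h>

static int recv_once(int sockfd, void *buf, size_t len)
{
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;
        int ret;

        ret = io_uring_queue_init(8, &ring, 0);
        if (ret < 0)
                return ret;

        sqe = io_uring_get_sqe(&ring);
        if (!sqe) {
                io_uring_queue_exit(&ring);
                return -EBUSY;
        }
        io_uring_prep_recv(sqe, sockfd, buf, len, 0);

        io_uring_submit(&ring);
        ret = io_uring_wait_cqe(&ring, &cqe);
        if (!ret) {
                ret = cqe->res;         /* bytes received, or negative error */
                io_uring_cqe_seen(&ring, cqe);
        }
        io_uring_queue_exit(&ring);
        return ret;
}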
The feature is marked with IORING_FEAT_FAST_POLL, meaning that async pollable IO is fast, and that pollother_op is fast as well. Signed-off-by: Jens Axboe --- fs/io_uring.c | 354 ++++++++++++++++++++++++++++++---------- include/trace/events/io_uring.h | 103 ++++++++++++ include/uapi/linux/io_uring.h | 1 + 3 files changed, 375 insertions(+), 83 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 0d973de75127..8c976fde40bd 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -487,6 +487,7 @@ enum { REQ_F_COMP_LOCKED_BIT, REQ_F_NEED_CLEANUP_BIT, REQ_F_OVERFLOW_BIT, + REQ_F_POLLED_BIT, }; enum { @@ -529,6 +530,13 @@ enum { REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT), /* in overflow list */ REQ_F_OVERFLOW = BIT(REQ_F_OVERFLOW_BIT), + /* already went through poll handler */ + REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), +}; + +struct async_poll { + struct io_poll_iocb poll; + struct io_wq_work work; }; /* @@ -562,27 +570,29 @@ struct io_kiocb { u8 opcode; struct io_ring_ctx *ctx; - union { - struct list_head list; - struct hlist_node hash_node; - }; - struct list_head link_list; + struct list_head list; unsigned int flags; refcount_t refs; + struct task_struct *task; u64 user_data; u32 result; u32 sequence; + struct list_head link_list; + struct list_head inflight_entry; union { /* * Only commands that never go async can use the below fields, - * obviously. Right now only IORING_OP_POLL_ADD uses them. + * obviously. Right now only IORING_OP_POLL_ADD uses them, and + * async armed poll handlers for regular commands. The latter + * restore the work, if needed. */ struct { - struct task_struct *task; struct callback_head task_work; + struct hlist_node hash_node; + struct async_poll *apoll; }; struct io_wq_work work; }; @@ -3563,9 +3573,209 @@ out: #endif } -static bool io_poll_remove_one(struct io_kiocb *req) +struct io_poll_table { + struct poll_table_struct pt; + struct io_kiocb *req; + int error; +}; + +static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, + struct wait_queue_head *head) +{ + if (unlikely(poll->head)) { + pt->error = -EINVAL; + return; + } + + pt->error = 0; + poll->head = head; + add_wait_queue(head, &poll->wait); +} + +static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, + struct poll_table_struct *p) +{ + struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); + + __io_queue_proc(&pt->req->apoll->poll, pt, head); +} + +static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, + __poll_t mask, task_work_func_t func) +{ + struct task_struct *tsk; + + /* for instances that support it check for an event match first: */ + if (mask && !(mask & poll->events)) + return 0; + + trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask); + + list_del_init(&poll->wait.entry); + + tsk = req->task; + req->result = mask; + init_task_work(&req->task_work, func); + /* + * If this fails, then the task is exiting. If that is the case, then + * the exit check will ultimately cancel these work items. Hence we + * don't need to check here and handle it specifically. 
+ */ + task_work_add(tsk, &req->task_work, true); + wake_up_process(tsk); + return 1; +} + +static void io_async_task_func(struct callback_head *cb) +{ + struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); + struct async_poll *apoll = req->apoll; + struct io_ring_ctx *ctx = req->ctx; + + trace_io_uring_task_run(req->ctx, req->opcode, req->user_data); + + WARN_ON_ONCE(!list_empty(&req->apoll->poll.wait.entry)); + + if (hash_hashed(&req->hash_node)) { + spin_lock_irq(&ctx->completion_lock); + hash_del(&req->hash_node); + spin_unlock_irq(&ctx->completion_lock); + } + + /* restore ->work in case we need to retry again */ + memcpy(&req->work, &apoll->work, sizeof(req->work)); + + __set_current_state(TASK_RUNNING); + mutex_lock(&ctx->uring_lock); + __io_queue_sqe(req, NULL); + mutex_unlock(&ctx->uring_lock); + + kfree(apoll); +} + +static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync, + void *key) +{ + struct io_kiocb *req = wait->private; + struct io_poll_iocb *poll = &req->apoll->poll; + + trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data, + key_to_poll(key)); + + return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func); +} + +static void io_poll_req_insert(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + struct hlist_head *list; + + list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)]; + hlist_add_head(&req->hash_node, list); +} + +static __poll_t __io_arm_poll_handler(struct io_kiocb *req, + struct io_poll_iocb *poll, + struct io_poll_table *ipt, __poll_t mask, + wait_queue_func_t wake_func) + __acquires(&ctx->completion_lock) +{ + struct io_ring_ctx *ctx = req->ctx; + bool cancel = false; + + poll->file = req->file; + poll->head = NULL; + poll->done = poll->canceled = false; + poll->events = mask; + + ipt->pt._key = mask; + ipt->req = req; + ipt->error = -EINVAL; + + INIT_LIST_HEAD(&poll->wait.entry); + init_waitqueue_func_entry(&poll->wait, wake_func); + poll->wait.private = req; + + mask = vfs_poll(req->file, &ipt->pt) & poll->events; + + spin_lock_irq(&ctx->completion_lock); + if (likely(poll->head)) { + spin_lock(&poll->head->lock); + if (unlikely(list_empty(&poll->wait.entry))) { + if (ipt->error) + cancel = true; + ipt->error = 0; + mask = 0; + } + if (mask || ipt->error) + list_del_init(&poll->wait.entry); + else if (cancel) + WRITE_ONCE(poll->canceled, true); + else if (!poll->done) /* actually waiting for an event */ + io_poll_req_insert(req); + spin_unlock(&poll->head->lock); + } + + return mask; +} + +static bool io_arm_poll_handler(struct io_kiocb *req) +{ + const struct io_op_def *def = &io_op_defs[req->opcode]; + struct io_ring_ctx *ctx = req->ctx; + struct async_poll *apoll; + struct io_poll_table ipt; + __poll_t mask, ret; + + if (!req->file || !file_can_poll(req->file)) + return false; + if (req->flags & (REQ_F_MUST_PUNT | REQ_F_POLLED)) + return false; + if (!def->pollin && !def->pollout) + return false; + + apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); + if (unlikely(!apoll)) + return false; + + req->flags |= REQ_F_POLLED; + memcpy(&apoll->work, &req->work, sizeof(req->work)); + + /* + * Don't need a reference here, as we're adding it to the task + * task_works list. If the task exits, the list is pruned. 
+ */ + req->task = current; + req->apoll = apoll; + INIT_HLIST_NODE(&req->hash_node); + + if (def->pollin) + mask = POLLIN | POLLRDNORM; + if (def->pollout) + mask |= POLLOUT | POLLWRNORM; + mask |= POLLERR | POLLPRI; + + ipt.pt._qproc = io_async_queue_proc; + + ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, + io_async_wake); + if (ret) { + ipt.error = 0; + apoll->poll.done = true; + spin_unlock_irq(&ctx->completion_lock); + memcpy(&req->work, &apoll->work, sizeof(req->work)); + kfree(apoll); + return false; + } + spin_unlock_irq(&ctx->completion_lock); + trace_io_uring_poll_arm(ctx, req->opcode, req->user_data, mask, + apoll->poll.events); + return true; +} + +static bool __io_poll_remove_one(struct io_kiocb *req, + struct io_poll_iocb *poll) { - struct io_poll_iocb *poll = &req->poll; bool do_complete = false; spin_lock(&poll->head->lock); @@ -3575,7 +3785,24 @@ static bool io_poll_remove_one(struct io_kiocb *req) do_complete = true; } spin_unlock(&poll->head->lock); + return do_complete; +} + +static bool io_poll_remove_one(struct io_kiocb *req) +{ + bool do_complete; + + if (req->opcode == IORING_OP_POLL_ADD) { + do_complete = __io_poll_remove_one(req, &req->poll); + } else { + /* non-poll requests have submit ref still */ + do_complete = __io_poll_remove_one(req, &req->apoll->poll); + if (do_complete) + io_put_req(req); + } + hash_del(&req->hash_node); + if (do_complete) { io_cqring_fill_event(req, -ECANCELED); io_commit_cqring(req->ctx); @@ -3686,8 +3913,13 @@ static void io_poll_task_func(struct callback_head *cb) struct io_kiocb *nxt = NULL; io_poll_task_handler(req, &nxt); - if (nxt) + if (nxt) { + struct io_ring_ctx *ctx = nxt->ctx; + + mutex_lock(&ctx->uring_lock); __io_queue_sqe(nxt, NULL); + mutex_unlock(&ctx->uring_lock); + } } static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, @@ -3695,51 +3927,16 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, { struct io_kiocb *req = wait->private; struct io_poll_iocb *poll = &req->poll; - __poll_t mask = key_to_poll(key); - struct task_struct *tsk; - /* for instances that support it check for an event match first: */ - if (mask && !(mask & poll->events)) - return 0; - - list_del_init(&poll->wait.entry); - - tsk = req->task; - req->result = mask; - init_task_work(&req->task_work, io_poll_task_func); - task_work_add(tsk, &req->task_work, true); - wake_up_process(tsk); - return 1; + return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func); } -struct io_poll_table { - struct poll_table_struct pt; - struct io_kiocb *req; - int error; -}; - static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, struct poll_table_struct *p) { struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); - if (unlikely(pt->req->poll.head)) { - pt->error = -EINVAL; - return; - } - - pt->error = 0; - pt->req->poll.head = head; - add_wait_queue(head, &pt->req->poll.wait); -} - -static void io_poll_req_insert(struct io_kiocb *req) -{ - struct io_ring_ctx *ctx = req->ctx; - struct hlis