Diffstat (limited to 'fs/io_uring.c')
-rw-r--r-- | fs/io_uring.c | 2015
1 file changed, 1406 insertions(+), 609 deletions(-)
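Before the diff body, a hedged userspace sketch of the buffer-selection flow this patch introduces (IORING_OP_PROVIDE_BUFFERS plus IOSQE_BUFFER_SELECT). The liburing helpers (io_uring_prep_provide_buffers(), io_uring_prep_recv(), io_uring_sqe_set_flags()), the sqe->buf_group field name, and the BGID/sockfd values are assumptions of the sketch rather than anything defined in this patch; only the cqe->flags encoding is taken directly from io_put_kbuf() in the diff below.

#include <liburing.h>
#include <stdlib.h>

#define BGID	1	/* buffer group ID; value is arbitrary for this sketch */
#define NR_BUFS	8
#define BUF_LEN	4096

/*
 * Publish NR_BUFS buffers of BUF_LEN bytes to the kernel under group BGID.
 * `base` must point at NR_BUFS * BUF_LEN contiguous bytes.
 */
static int provide_buffers(struct io_uring *ring, void *base)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	io_uring_prep_provide_buffers(sqe, base, BUF_LEN, NR_BUFS, BGID, 0);
	io_uring_submit(ring);
	if (io_uring_wait_cqe(ring, &cqe))
		return -1;
	ret = cqe->res;
	io_uring_cqe_seen(ring, cqe);
	return ret < 0 ? ret : 0;
}

/*
 * Receive without passing a buffer: the kernel picks one from group BGID
 * and reports its ID in cqe->flags, encoded the way io_put_kbuf() does it
 * below (bid << IORING_CQE_BUFFER_SHIFT, OR'ed with IORING_CQE_F_BUFFER).
 * Returns the selected buffer ID when one was attached, otherwise the raw
 * recv result (which is negative on error).
 */
static int recv_selected(struct io_uring *ring, int sockfd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	io_uring_prep_recv(sqe, sockfd, NULL, BUF_LEN, 0);
	io_uring_sqe_set_flags(sqe, IOSQE_BUFFER_SELECT);
	sqe->buf_group = BGID;
	io_uring_submit(ring);

	if (io_uring_wait_cqe(ring, &cqe))
		return -1;
	ret = cqe->res;
	if (ret >= 0 && (cqe->flags & IORING_CQE_F_BUFFER))
		ret = cqe->flags >> IORING_CQE_BUFFER_SHIFT;
	io_uring_cqe_seen(ring, cqe);
	return ret;
}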
diff --git a/fs/io_uring.c b/fs/io_uring.c index 3affd96a98ba..358f97be9c7b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -44,6 +44,7 @@ #include <linux/errno.h> #include <linux/syscalls.h> #include <linux/compat.h> +#include <net/compat.h> #include <linux/refcount.h> #include <linux/uio.h> #include <linux/bits.h> @@ -76,6 +77,8 @@ #include <linux/fadvise.h> #include <linux/eventpoll.h> #include <linux/fs_struct.h> +#include <linux/splice.h> +#include <linux/task_work.h> #define CREATE_TRACE_POINTS #include <trace/events/io_uring.h> @@ -193,6 +196,13 @@ struct fixed_file_data { struct completion done; }; +struct io_buffer { + struct list_head list; + __u64 addr; + __s32 len; + __u16 bid; +}; + struct io_ring_ctx { struct { struct percpu_ref refs; @@ -270,6 +280,8 @@ struct io_ring_ctx { struct socket *ring_sock; #endif + struct idr io_buffer_idr; + struct idr personality_idr; struct { @@ -290,7 +302,6 @@ struct io_ring_ctx { struct { spinlock_t completion_lock; - struct llist_head poll_llist; /* * ->poll_list is protected by the ctx->uring_lock for @@ -386,7 +397,9 @@ struct io_sr_msg { void __user *buf; }; int msg_flags; + int bgid; size_t len; + struct io_buffer *kbuf; }; struct io_open { @@ -430,6 +443,24 @@ struct io_epoll { struct epoll_event event; }; +struct io_splice { + struct file *file_out; + struct file *file_in; + loff_t off_out; + loff_t off_in; + u64 len; + unsigned int flags; +}; + +struct io_provide_buf { + struct file *file; + __u64 addr; + __s32 len; + __u32 bgid; + __u16 nbufs; + __u16 bid; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -464,6 +495,7 @@ enum { REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT, REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT, REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, + REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, REQ_F_LINK_NEXT_BIT, REQ_F_FAIL_LINK_BIT, @@ -479,6 +511,11 @@ enum { REQ_F_COMP_LOCKED_BIT, REQ_F_NEED_CLEANUP_BIT, REQ_F_OVERFLOW_BIT, + REQ_F_POLLED_BIT, + REQ_F_BUFFER_SELECTED_BIT, + + /* not a real bit, just to check we're not overflowing the space */ + __REQ_F_LAST_BIT, }; enum { @@ -492,6 +529,8 @@ enum { REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT), /* IOSQE_ASYNC */ REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT), + /* IOSQE_BUFFER_SELECT */ + REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), /* already grabbed next link */ REQ_F_LINK_NEXT = BIT(REQ_F_LINK_NEXT_BIT), @@ -521,6 +560,15 @@ enum { REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT), /* in overflow list */ REQ_F_OVERFLOW = BIT(REQ_F_OVERFLOW_BIT), + /* already went through poll handler */ + REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), + /* buffer already selected */ + REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), +}; + +struct async_poll { + struct io_poll_iocb poll; + struct io_wq_work work; }; /* @@ -546,32 +594,45 @@ struct io_kiocb { struct io_fadvise fadvise; struct io_madvise madvise; struct io_epoll epoll; + struct io_splice splice; + struct io_provide_buf pbuf; }; struct io_async_ctx *io; - /* - * llist_node is only used for poll deferred completions - */ - struct llist_node llist_node; - bool in_async; bool needs_fixed_file; u8 opcode; struct io_ring_ctx *ctx; - union { - struct list_head list; - struct hlist_node hash_node; - }; - struct list_head link_list; + struct list_head list; unsigned int flags; refcount_t refs; + union { + struct task_struct *task; + unsigned long fsize; + }; u64 user_data; u32 result; u32 sequence; + struct list_head link_list; + struct list_head inflight_entry; - struct io_wq_work work; + union { + /* + * Only 
commands that never go async can use the below fields, + * obviously. Right now only IORING_OP_POLL_ADD uses them, and + * async armed poll handlers for regular commands. The latter + * restore the work, if needed. + */ + struct { + struct callback_head task_work; + struct hlist_node hash_node; + struct async_poll *apoll; + int cflags; + }; + struct io_wq_work work; + }; }; #define IO_PLUG_THRESHOLD 2 @@ -615,6 +676,11 @@ struct io_op_def { unsigned file_table : 1; /* needs ->fs */ unsigned needs_fs : 1; + /* set if opcode supports polled "wait" */ + unsigned pollin : 1; + unsigned pollout : 1; + /* op supports buffer selection */ + unsigned buffer_select : 1; }; static const struct io_op_def io_op_defs[] = { @@ -624,6 +690,8 @@ static const struct io_op_def io_op_defs[] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, + .pollin = 1, + .buffer_select = 1, }, [IORING_OP_WRITEV] = { .async_ctx = 1, @@ -631,6 +699,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, + .pollout = 1, }, [IORING_OP_FSYNC] = { .needs_file = 1, @@ -638,11 +707,13 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_READ_FIXED] = { .needs_file = 1, .unbound_nonreg_file = 1, + .pollin = 1, }, [IORING_OP_WRITE_FIXED] = { .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, + .pollout = 1, }, [IORING_OP_POLL_ADD] = { .needs_file = 1, @@ -658,6 +729,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .needs_fs = 1, + .pollout = 1, }, [IORING_OP_RECVMSG] = { .async_ctx = 1, @@ -665,6 +737,8 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .needs_fs = 1, + .pollin = 1, + .buffer_select = 1, }, [IORING_OP_TIMEOUT] = { .async_ctx = 1, @@ -676,6 +750,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .file_table = 1, + .pollin = 1, }, [IORING_OP_ASYNC_CANCEL] = {}, [IORING_OP_LINK_TIMEOUT] = { @@ -687,6 +762,7 @@ static const struct io_op_def io_op_defs[] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, + .pollout = 1, }, [IORING_OP_FALLOCATE] = { .needs_file = 1, @@ -715,11 +791,14 @@ static const struct io_op_def io_op_defs[] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, + .pollin = 1, + .buffer_select = 1, }, [IORING_OP_WRITE] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, + .pollout = 1, }, [IORING_OP_FADVISE] = { .needs_file = 1, @@ -731,11 +810,14 @@ static const struct io_op_def io_op_defs[] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, + .pollout = 1, }, [IORING_OP_RECV] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, + .pollin = 1, + .buffer_select = 1, }, [IORING_OP_OPENAT2] = { .needs_file = 1, @@ -747,6 +829,13 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .file_table = 1, }, + [IORING_OP_SPLICE] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_PROVIDE_BUFFERS] = {}, + [IORING_OP_REMOVE_BUFFERS] = {}, }; static void io_wq_submit_work(struct io_wq_work **workptr); @@ -761,6 +850,10 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, static int io_grab_files(struct io_kiocb *req); static void io_ring_file_ref_flush(struct fixed_file_data *data); static void io_cleanup_req(struct io_kiocb *req); +static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, + int fd, struct file **out_file, bool fixed); +static void 
__io_queue_sqe(struct io_kiocb *req, + const struct io_uring_sqe *sqe); static struct kmem_cache *req_cachep; @@ -827,11 +920,11 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->cq_overflow_list); init_completion(&ctx->completions[0]); init_completion(&ctx->completions[1]); + idr_init(&ctx->io_buffer_idr); idr_init(&ctx->personality_idr); mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->wait); spin_lock_init(&ctx->completion_lock); - init_llist_head(&ctx->poll_llist); INIT_LIST_HEAD(&ctx->poll_list); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); @@ -952,15 +1045,14 @@ static inline void io_req_work_drop_env(struct io_kiocb *req) } } -static inline bool io_prep_async_work(struct io_kiocb *req, +static inline void io_prep_async_work(struct io_kiocb *req, struct io_kiocb **link) { const struct io_op_def *def = &io_op_defs[req->opcode]; - bool do_hashed = false; if (req->flags & REQ_F_ISREG) { if (def->hash_reg_file) - do_hashed = true; + io_wq_hash_work(&req->work, file_inode(req->file)); } else { if (def->unbound_nonreg_file) req->work.flags |= IO_WQ_WORK_UNBOUND; @@ -969,25 +1061,18 @@ static inline bool io_prep_async_work(struct io_kiocb *req, io_req_work_grab_env(req, def); *link = io_prep_linked_timeout(req); - return do_hashed; } static inline void io_queue_async_work(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *link; - bool do_hashed; - do_hashed = io_prep_async_work(req, &link); + io_prep_async_work(req, &link); - trace_io_uring_queue_async_work(ctx, do_hashed, req, &req->work, - req->flags); - if (!do_hashed) { - io_wq_enqueue(ctx->io_wq, &req->work); - } else { - io_wq_enqueue_hashed(ctx->io_wq, &req->work, - file_inode(req->file)); - } + trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req, + &req->work, req->flags); + io_wq_enqueue(ctx->io_wq, &req->work); if (link) io_queue_linked_timeout(link); @@ -1054,24 +1139,19 @@ static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) return false; if (!ctx->eventfd_async) return true; - return io_wq_current_is_worker() || in_interrupt(); + return io_wq_current_is_worker(); } -static void __io_cqring_ev_posted(struct io_ring_ctx *ctx, bool trigger_ev) +static void io_cqring_ev_posted(struct io_ring_ctx *ctx) { if (waitqueue_active(&ctx->wait)) wake_up(&ctx->wait); if (waitqueue_active(&ctx->sqo_wait)) wake_up(&ctx->sqo_wait); - if (trigger_ev) + if (io_should_trigger_evfd(ctx)) eventfd_signal(ctx->cq_ev_fd, 1); } -static void io_cqring_ev_posted(struct io_ring_ctx *ctx) -{ - __io_cqring_ev_posted(ctx, io_should_trigger_evfd(ctx)); -} - /* Returns true if there are no backlogged entries after the flush */ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { @@ -1108,7 +1188,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) if (cqe) { WRITE_ONCE(cqe->user_data, req->user_data); WRITE_ONCE(cqe->res, req->result); - WRITE_ONCE(cqe->flags, 0); + WRITE_ONCE(cqe->flags, req->cflags); } else { WRITE_ONCE(ctx->rings->cq_overflow, atomic_inc_return(&ctx->cached_cq_overflow)); @@ -1132,7 +1212,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) return cqe != NULL; } -static void io_cqring_fill_event(struct io_kiocb *req, long res) +static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) { struct io_ring_ctx *ctx = req->ctx; struct io_uring_cqe *cqe; @@ -1148,7 +1228,7 @@ static void io_cqring_fill_event(struct 
io_kiocb *req, long res) if (likely(cqe)) { WRITE_ONCE(cqe->user_data, req->user_data); WRITE_ONCE(cqe->res, res); - WRITE_ONCE(cqe->flags, 0); + WRITE_ONCE(cqe->flags, cflags); } else if (ctx->cq_overflow_flushed) { WRITE_ONCE(ctx->rings->cq_overflow, atomic_inc_return(&ctx->cached_cq_overflow)); @@ -1160,23 +1240,34 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res) req->flags |= REQ_F_OVERFLOW; refcount_inc(&req->refs); req->result = res; + req->cflags = cflags; list_add_tail(&req->list, &ctx->cq_overflow_list); } } -static void io_cqring_add_event(struct io_kiocb *req, long res) +static void io_cqring_fill_event(struct io_kiocb *req, long res) +{ + __io_cqring_fill_event(req, res, 0); +} + +static void __io_cqring_add_event(struct io_kiocb *req, long res, long cflags) { struct io_ring_ctx *ctx = req->ctx; unsigned long flags; spin_lock_irqsave(&ctx->completion_lock, flags); - io_cqring_fill_event(req, res); + __io_cqring_fill_event(req, res, cflags); io_commit_cqring(ctx); spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); } +static void io_cqring_add_event(struct io_kiocb *req, long res) +{ + __io_cqring_add_event(req, res, 0); +} + static inline bool io_is_fallback_req(struct io_kiocb *req) { return req == (struct io_kiocb *) @@ -1246,6 +1337,15 @@ fallback: return NULL; } +static inline void io_put_file(struct io_kiocb *req, struct file *file, + bool fixed) +{ + if (fixed) + percpu_ref_put(&req->ctx->file_data->refs); + else + fput(file); +} + static void __io_req_do_free(struct io_kiocb *req) { if (likely(!io_is_fallback_req(req))) @@ -1256,18 +1356,12 @@ static void __io_req_do_free(struct io_kiocb *req) static void __io_req_aux_free(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; - if (req->flags & REQ_F_NEED_CLEANUP) io_cleanup_req(req); kfree(req->io); - if (req->file) { - if (req->flags & REQ_F_FIXED_FILE) - percpu_ref_put(&ctx->file_data->refs); - else - fput(req->file); - } + if (req->file) + io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); io_req_work_drop_env(req); } @@ -1474,6 +1568,30 @@ static void io_free_req(struct io_kiocb *req) io_queue_async_work(nxt); } +static void io_link_work_cb(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct io_kiocb *link; + + link = list_first_entry(&req->link_list, struct io_kiocb, link_list); + io_queue_linked_timeout(link); + io_wq_submit_work(workptr); +} + +static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt) +{ + struct io_kiocb *link; + const struct io_op_def *def = &io_op_defs[nxt->opcode]; + + if ((nxt->flags & REQ_F_ISREG) && def->hash_reg_file) + io_wq_hash_work(&nxt->work, file_inode(nxt->file)); + + *workptr = &nxt->work; + link = io_prep_linked_timeout(nxt); + if (link) + nxt->work.func = io_link_work_cb; +} + /* * Drop reference to request, return next in chain (if there is one) if this * was the last reference to this request. @@ -1493,6 +1611,26 @@ static void io_put_req(struct io_kiocb *req) io_free_req(req); } +static void io_steal_work(struct io_kiocb *req, + struct io_wq_work **workptr) +{ + /* + * It's in an io-wq worker, so there always should be at least + * one reference, which will be dropped in io_put_work() just + * after the current handler returns. + * + * It also means, that if the counter dropped to 1, then there is + * no asynchronous users left, so it's safe to steal the next work. 
+ */ + if (refcount_read(&req->refs) == 1) { + struct io_kiocb *nxt = NULL; + + io_req_find_next(req, &nxt); + if (nxt) + io_wq_assign_next(workptr, nxt); + } +} + /* * Must only be used if we don't need to care about links, usually from * within the completion handling itself. @@ -1554,6 +1692,19 @@ static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req) return true; } +static int io_put_kbuf(struct io_kiocb *req) +{ + struct io_buffer *kbuf; + int cflags; + + kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; + cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT; + cflags |= IORING_CQE_F_BUFFER; + req->rw.addr = 0; + kfree(kbuf); + return cflags; +} + /* * Find and free completed poll iocbs */ @@ -1565,10 +1716,15 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, rb.to_free = rb.need_iter = 0; while (!list_empty(done)) { + int cflags = 0; + req = list_first_entry(done, struct io_kiocb, list); list_del(&req->list); - io_cqring_fill_event(req, req->result); + if (req->flags & REQ_F_BUFFER_SELECTED) + cflags = io_put_kbuf(req); + + __io_cqring_fill_event(req, req->result, cflags); (*nr_events)++; if (refcount_dec_and_test(&req->refs) && @@ -1577,6 +1733,8 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, } io_commit_cqring(ctx); + if (ctx->flags & IORING_SETUP_SQPOLL) + io_cqring_ev_posted(ctx); io_free_req_many(ctx, &rb); } @@ -1743,13 +1901,16 @@ static inline void req_set_fail_links(struct io_kiocb *req) static void io_complete_rw_common(struct kiocb *kiocb, long res) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); + int cflags = 0; if (kiocb->ki_flags & IOCB_WRITE) kiocb_end_write(req); if (res != req->result) req_set_fail_links(req); - io_cqring_add_event(req, res); + if (req->flags & REQ_F_BUFFER_SELECTED) + cflags = io_put_kbuf(req); + __io_cqring_add_event(req, res, cflags); } static void io_complete_rw(struct kiocb *kiocb, long res, long res2) @@ -1760,17 +1921,6 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2) io_put_req(req); } -static struct io_kiocb *__io_complete_rw(struct kiocb *kiocb, long res) -{ - struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); - struct io_kiocb *nxt = NULL; - - io_complete_rw_common(kiocb, res); - io_put_req_find_next(req, &nxt); - - return nxt; -} - static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); @@ -1841,7 +1991,7 @@ static void io_file_put(struct io_submit_state *state) * assuming most submissions are for one file, or at least that each file * has more than one submission. 
*/ -static struct file *io_file_get(struct io_submit_state *state, int fd) +static struct file *__io_file_get(struct io_submit_state *state, int fd) { if (!state) return fget(fd); @@ -1938,7 +2088,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, req->rw.addr = READ_ONCE(sqe->addr); req->rw.len = READ_ONCE(sqe->len); - /* we own ->private, reuse it for the buffer index */ + /* we own ->private, reuse it for the buffer index / buffer ID */ req->rw.kiocb.private = (void *) (unsigned long) READ_ONCE(sqe->buf_index); return 0; @@ -1965,15 +2115,14 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) } } -static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt, - bool in_async) +static void kiocb_done(struct kiocb *kiocb, ssize_t ret) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); if (req->flags & REQ_F_CUR_POS) req->file->f_pos = kiocb->ki_pos; - if (in_async && ret >= 0 && kiocb->ki_complete == io_complete_rw) - *nxt = __io_complete_rw(kiocb, ret); + if (ret >= 0 && kiocb->ki_complete == io_complete_rw) + io_complete_rw(kiocb, ret, 0); else io_rw_done(kiocb, ret); } @@ -2052,11 +2201,147 @@ static ssize_t io_import_fixed(struct io_kiocb *req, int rw, return len; } +static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock) +{ + if (needs_lock) + mutex_unlock(&ctx->uring_lock); +} + +static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock) +{ + /* + * "Normal" inline submissions always hold the uring_lock, since we + * grab it from the system call. Same is true for the SQPOLL offload. + * The only exception is when we've detached the request and issue it + * from an async worker thread, grab the lock for that case. + */ + if (needs_lock) + mutex_lock(&ctx->uring_lock); +} + +static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len, + int bgid, struct io_buffer *kbuf, + bool needs_lock) +{ + struct io_buffer *head; + + if (req->flags & REQ_F_BUFFER_SELECTED) + return kbuf; + + io_ring_submit_lock(req->ctx, needs_lock); + + lockdep_assert_held(&req->ctx->uring_lock); + + head = idr_find(&req->ctx->io_buffer_idr, bgid); + if (head) { + if (!list_empty(&head->list)) { + kbuf = list_last_entry(&head->list, struct io_buffer, + list); + list_del(&kbuf->list); + } else { + kbuf = head; + idr_remove(&req->ctx->io_buffer_idr, bgid); + } + if (*len > kbuf->len) + *len = kbuf->len; + } else { + kbuf = ERR_PTR(-ENOBUFS); + } + + io_ring_submit_unlock(req->ctx, needs_lock); + + return kbuf; +} + +static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len, + bool needs_lock) +{ + struct io_buffer *kbuf; + int bgid; + + kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; + bgid = (int) (unsigned long) req->rw.kiocb.private; + kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock); + if (IS_ERR(kbuf)) + return kbuf; + req->rw.addr = (u64) (unsigned long) kbuf; + req->flags |= REQ_F_BUFFER_SELECTED; + return u64_to_user_ptr(kbuf->addr); +} + +#ifdef CONFIG_COMPAT +static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, + bool needs_lock) +{ + struct compat_iovec __user *uiov; + compat_ssize_t clen; + void __user *buf; + ssize_t len; + + uiov = u64_to_user_ptr(req->rw.addr); + if (!access_ok(uiov, sizeof(*uiov))) + return -EFAULT; + if (__get_user(clen, &uiov->iov_len)) + return -EFAULT; + if (clen < 0) + return -EINVAL; + + len = clen; + buf = io_rw_buffer_select(req, &len, needs_lock); + if (IS_ERR(buf)) + return 
PTR_ERR(buf); + iov[0].iov_base = buf; + iov[0].iov_len = (compat_size_t) len; + return 0; +} +#endif + +static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, + bool needs_lock) +{ + struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr); + void __user *buf; + ssize_t len; + + if (copy_from_user(iov, uiov, sizeof(*uiov))) + return -EFAULT; + + len = iov[0].iov_len; + if (len < 0) + return -EINVAL; + buf = io_rw_buffer_select(req, &len, needs_lock); + if (IS_ERR(buf)) + return PTR_ERR(buf); + iov[0].iov_base = buf; + iov[0].iov_len = len; + return 0; +} + +static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, + bool needs_lock) +{ + if (req->flags & REQ_F_BUFFER_SELECTED) + return 0; + if (!req->rw.len) + return 0; + else if (req->rw.len > 1) + return -EINVAL; + +#ifdef CONFIG_COMPAT + if (req->ctx->compat) + return io_compat_import(req, iov, needs_lock); +#endif + + return __io_iov_buffer_select(req, iov, needs_lock); +} + static ssize_t io_import_iovec(int rw, struct io_kiocb *req, - struct iovec **iovec, struct iov_iter *iter) + struct iovec **iovec, struct iov_iter *iter, + bool needs_lock) { void __user *buf = u64_to_user_ptr(req->rw.addr); size_t sqe_len = req->rw.len; + ssize_t ret; u8 opcode; opcode = req->opcode; @@ -2065,12 +2350,20 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, return io_import_fixed(req, rw, iter); } - /* buffer index only valid with fixed read/write */ - if (req->rw.kiocb.private) + /* buffer index only valid with fixed read/write, or buffer select */ + if (req->rw.kiocb.private && !(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { - ssize_t ret; + if (req->flags & REQ_F_BUFFER_SELECT) { + buf = io_rw_buffer_select(req, &sqe_len, needs_lock); + if (IS_ERR(buf)) { + *iovec = NULL; + return PTR_ERR(buf); + } + req->rw.len = sqe_len; + } + ret = import_single_range(rw, buf, sqe_len, *iovec, iter); *iovec = NULL; return ret < 0 ? 
ret : sqe_len; @@ -2086,6 +2379,16 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, return iorw->size; } + if (req->flags & REQ_F_BUFFER_SELECT) { + ret = io_iov_buffer_select(req, *iovec, needs_lock); + if (!ret) { + ret = (*iovec)->iov_len; + iov_iter_init(iter, rw, *iovec, 1, ret); + } + *iovec = NULL; + return ret; + } + #ifdef CONFIG_COMPAT if (req->ctx->compat) return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV, @@ -2169,12 +2472,18 @@ static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size, } } +static inline int __io_alloc_async_ctx(struct io_kiocb *req) +{ + req->io = kmalloc(sizeof(*req->io), GFP_KERNEL); + return req->io == NULL; +} + static int io_alloc_async_ctx(struct io_kiocb *req) { if (!io_op_defs[req->opcode].async_ctx) return 0; - req->io = kmalloc(sizeof(*req->io), GFP_KERNEL); - return req->io == NULL; + + return __io_alloc_async_ctx(req); } static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, @@ -2184,7 +2493,7 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, if (!io_op_defs[req->opcode].async_ctx) return 0; if (!req->io) { - if (io_alloc_async_ctx(req)) + if (__io_alloc_async_ctx(req)) return -ENOMEM; io_req_map_rw(req, io_size, iovec, fast_iov, iter); @@ -2213,7 +2522,7 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, io = req->io; io->rw.iov = io->rw.fast_iov; req->io = NULL; - ret = io_import_iovec(READ, req, &io->rw.iov, &iter); + ret = io_import_iovec(READ, req, &io->rw.iov, &iter, !force_nonblock); req->io = io; if (ret < 0) return ret; @@ -2222,8 +2531,7 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } -static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_read(struct io_kiocb *req, bool force_nonblock) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw.kiocb; @@ -2231,13 +2539,13 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, size_t iov_count; ssize_t io_size, ret; - ret = io_import_iovec(READ, req, &iovec, &iter); + ret = io_import_iovec(READ, req, &iovec, &iter, !force_nonblock); if (ret < 0) return ret; /* Ensure we clear previously set non-block flag */ if (!force_nonblock) - req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT; + kiocb->ki_flags &= ~IOCB_NOWAIT; req->result = 0; io_size = ret; @@ -2248,10 +2556,8 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so * we know to async punt it even if it was opened O_NONBLOCK */ - if (force_nonblock && !io_file_supports_async(req->file)) { - req->flags |= REQ_F_MUST_PUNT; + if (force_nonblock && !io_file_supports_async(req->file)) goto copy_iov; - } iov_count = iov_iter_count(&iter); ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count); @@ -2265,13 +2571,16 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, /* Catch -EAGAIN return for forced non-blocking submission */ if (!force_nonblock || ret2 != -EAGAIN) { - kiocb_done(kiocb, ret2, nxt, req->in_async); + kiocb_done(kiocb, ret2); } else { copy_iov: ret = io_setup_async_rw(req, io_size, iovec, inline_vecs, &iter); if (ret) goto out_free; + /* any defer here is final, must blocking retry */ + if (!(req->flags & REQ_F_NOWAIT)) + req->flags |= REQ_F_MUST_PUNT; return -EAGAIN; } } @@ -2295,6 +2604,8 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (unlikely(!(req->file->f_mode & 
FMODE_WRITE))) return -EBADF; + req->fsize = rlimit(RLIMIT_FSIZE); + /* either don't need iovec imported or already have it */ if (!req->io || req->flags & REQ_F_NEED_CLEANUP) return 0; @@ -2302,7 +2613,7 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, io = req->io; io->rw.iov = io->rw.fast_iov; req->io = NULL; - ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter); + ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter, !force_nonblock); req->io = io; if (ret < 0) return ret; @@ -2311,8 +2622,7 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } -static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_write(struct io_kiocb *req, bool force_nonblock) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw.kiocb; @@ -2320,7 +2630,7 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, size_t iov_count; ssize_t ret, io_size; - ret = io_import_iovec(WRITE, req, &iovec, &iter); + ret = io_import_iovec(WRITE, req, &iovec, &iter, !force_nonblock); if (ret < 0) return ret; @@ -2337,10 +2647,8 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so * we know to async punt it even if it was opened O_NONBLOCK */ - if (force_nonblock && !io_file_supports_async(req->file)) { - req->flags |= REQ_F_MUST_PUNT; + if (force_nonblock && !io_file_supports_async(req->file)) goto copy_iov; - } /* file path doesn't support NOWAIT for non-direct_IO */ if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) && @@ -2367,24 +2675,33 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, } kiocb->ki_flags |= IOCB_WRITE; + if (!force_nonblock) + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize; + if (req->file->f_op->write_iter) ret2 = call_write_iter(req->file, kiocb, &iter); else ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter); + + if (!force_nonblock) + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; + /* - * Raw bdev writes will -EOPNOTSUPP for IOCB_NOWAIT. Just + * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just * retry them without IOCB_NOWAIT. 
*/ if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) ret2 = -EAGAIN; if (!force_nonblock || ret2 != -EAGAIN) { - kiocb_done(kiocb, ret2, nxt, req->in_async); + kiocb_done(kiocb, ret2); } else { copy_iov: ret = io_setup_async_rw(req, io_size, iovec, inline_vecs, &iter); if (ret) goto out_free; + /* any defer here is final, must blocking retry */ + req->flags |= REQ_F_MUST_PUNT; return -EAGAIN; } } @@ -2394,6 +2711,76 @@ out_free: return ret; } +static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_splice* sp = &req->splice; + unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL; + int ret; + + if (req->flags & REQ_F_NEED_CLEANUP) + return 0; + + sp->file_in = NULL; + sp->off_in = READ_ONCE(sqe->splice_off_in); + sp->off_out = READ_ONCE(sqe->off); + sp->len = READ_ONCE(sqe->len); + sp->flags = READ_ONCE(sqe->splice_flags); + + if (unlikely(sp->flags & ~valid_flags)) + return -EINVAL; + + ret = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in), &sp->file_in, + (sp->flags & SPLICE_F_FD_IN_FIXED)); + if (ret) + return ret; + req->flags |= REQ_F_NEED_CLEANUP; + + if (!S_ISREG(file_inode(sp->file_in)->i_mode)) + req->work.flags |= IO_WQ_WORK_UNBOUND; + + return 0; +} + +static bool io_splice_punt(struct file *file) +{ + if (get_pipe_info(file)) + return false; + if (!io_file_supports_async(file)) + return true; + return !(file->f_mode & O_NONBLOCK); +} + +static int io_splice(struct io_kiocb *req, bool force_nonblock) +{ + struct io_splice *sp = &req->splice; + struct file *in = sp->file_in; + struct file *out = sp->file_out; + unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; + loff_t *poff_in, *poff_out; + long ret; + + if (force_nonblock) { + if (io_splice_punt(in) || io_splice_punt(out)) + return -EAGAIN; + flags |= SPLICE_F_NONBLOCK; + } + + poff_in = (sp->off_in == -1) ? NULL : &sp->off_in; + poff_out = (sp->off_out == -1) ? NULL : &sp->off_out; + ret = do_splice(in, poff_in, out, poff_out, sp->len, flags); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + + io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED)); + req->flags &= ~REQ_F_NEED_CLEANUP; + + io_cqring_add_event(req, ret); + if (ret != sp->len) + req_set_fail_links(req); + io_put_req(req); + return 0; +} + /* * IORING_OP_NOP just posts a completion event, nothing else. */ @@ -2442,85 +2829,63 @@ static bool io_req_cancelled(struct io_kiocb *req) return false; } -static void io_link_work_cb(struct io_wq_work **workptr) -{ - struct io_wq_work *work = *workptr; - struct io_kiocb *link = work->data; - - io_queue_linked_timeout(link); - work->func = io_wq_submit_work; -} - -static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt) +static void __io_fsync(struct io_kiocb *req) { - struct io_kiocb *link; - - io_prep_async_work(nxt, &link); - *workptr = &nxt->work; - if (link) { - nxt->work.flags |= IO_WQ_WORK_CB; - nxt->work.func = io_link_work_cb; - nxt->work.data = link; - } -} - -static void io_fsync_finish(struct io_wq_work **workptr) -{ - struct io_kiocb *req = |
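A second hedged userspace sketch, this one for the IORING_OP_SPLICE opcode introduced above. The io_uring_prep_splice() helper and its signature come from liburing and are an assumption of the sketch, not part of this patch; the -1 offsets map to "use the current file position", mirroring the -1 to NULL offset conversion in io_splice(), and passing SPLICE_F_FD_IN_FIXED would instead treat fd_in as a registered-file index, matching the io_file_get(..., fixed) lookup in io_splice_prep().

#include <liburing.h>

/*
 * Move up to `len` bytes from a pipe read end to a regular file via
 * IORING_OP_SPLICE. Returns the number of bytes spliced, or a negative
 * error code from cqe->res.
 */
static long splice_pipe_to_file(struct io_uring *ring, int pipe_rd,
				int file_fd, unsigned int len)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	long ret;

	/* off_in = -1, off_out = -1: splice at the current positions */
	io_uring_prep_splice(sqe, pipe_rd, -1, file_fd, -1, len, 0);
	io_uring_submit(ring);
	if (io_uring_wait_cqe(ring, &cqe))
		return -1;
	ret = cqe->res;
	io_uring_cqe_seen(ring, cqe);
	return ret;
}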