summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2020-12-16 12:44:05 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2020-12-16 12:44:05 -0800
commit48aba79bcf6ea05148dc82ad9c40713960b00396 (patch)
tree768ced0b8a924b8adb82771a797942cb5fd6ce1e
parent005b2a9dc819a1265a8c765595f8f6d88d6173d9 (diff)
parent59850d226e4907a6f37c1d2fe5ba97546a8691a4 (diff)
Merge tag 'for-5.11/io_uring-2020-12-14' of git://git.kernel.dk/linux-block
Pull io_uring updates from Jens Axboe: "Fairly light set of changes this time around, and mostly some bits that were pushed out to 5.11 instead of 5.10, fixes/cleanups, and a few features. In particular: - Cleanups around iovec import (David Laight, Pavel) - Add timeout support for io_uring_enter(2), which enables us to clean up liburing and avoid a timeout sqe submission in the completion path. The big win here is that it allows setups that split SQ and CQ handling into separate threads to avoid locking, as the CQ side will no longer submit when timeouts are needed when waiting for events (Hao Xu) - Add support for socket shutdown, and renameat/unlinkat. - SQPOLL cleanups and improvements (Xiaoguang Wang) - Allow SQPOLL setups for CAP_SYS_NICE, and enable regular (non-fixed) files to be used. - Cancelation improvements (Pavel) - Fixed file reference improvements (Pavel) - IOPOLL related race fixes (Pavel) - Lots of other little fixes and cleanups (mostly Pavel)" * tag 'for-5.11/io_uring-2020-12-14' of git://git.kernel.dk/linux-block: (43 commits) io_uring: fix io_cqring_events()'s noflush io_uring: fix racy IOPOLL flush overflow io_uring: fix racy IOPOLL completions io_uring: always let io_iopoll_complete() complete polled io io_uring: add timeout update io_uring: restructure io_timeout_cancel() io_uring: fix files cancellation io_uring: use bottom half safe lock for fixed file data io_uring: fix miscounting ios_left io_uring: change submit file state invariant io_uring: check kthread stopped flag when sq thread is unparked io_uring: share fixed_file_refs b/w multiple rsrcs io_uring: replace inflight_wait with tctx->wait io_uring: don't take fs for recvmsg/sendmsg io_uring: only wake up sq thread while current task is in io worker context io_uring: don't acquire uring_lock twice io_uring: initialize 'timeout' properly in io_sq_thread() io_uring: refactor io_sq_thread() handling io_uring: always batch cancel in *cancel_files() io_uring: pass files into kill timeouts/poll ...
-rw-r--r--fs/internal.h2
-rw-r--r--fs/io-wq.c10
-rw-r--r--fs/io-wq.h1
-rw-r--r--fs/io_uring.c1327
-rw-r--r--fs/namei.c40
-rw-r--r--include/linux/socket.h1
-rw-r--r--include/linux/syscalls.h2
-rw-r--r--include/uapi/linux/io_uring.h16
-rw-r--r--net/socket.c15
9 files changed, 824 insertions, 590 deletions
diff --git a/fs/internal.h b/fs/internal.h
index a7cd0f64faa4..6fd14ea213c3 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -78,6 +78,8 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
long do_rmdir(int dfd, struct filename *name);
long do_unlinkat(int dfd, struct filename *name);
int may_linkat(struct path *link);
+int do_renameat2(int olddfd, struct filename *oldname, int newdfd,
+ struct filename *newname, unsigned int flags);
/*
* namespace.c
diff --git a/fs/io-wq.c b/fs/io-wq.c
index b53c055bea6a..f72d53848dcb 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -1078,16 +1078,6 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
return IO_WQ_CANCEL_NOTFOUND;
}
-static bool io_wq_io_cb_cancel_data(struct io_wq_work *work, void *data)
-{
- return work == data;
-}
-
-enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
-{
- return io_wq_cancel_cb(wq, io_wq_io_cb_cancel_data, (void *)cwork, false);
-}
-
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
{
int ret = -ENOMEM, node;
diff --git a/fs/io-wq.h b/fs/io-wq.h
index cba36f03c355..069496c6d4f9 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -129,7 +129,6 @@ static inline bool io_wq_is_hashed(struct io_wq_work *work)
}
void io_wq_cancel_all(struct io_wq *wq);
-enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork);
typedef bool (work_cancel_fn)(struct io_wq_work *, void *);
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 0e8902be6b96..22e31050f33e 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -245,6 +245,8 @@ struct io_sq_data {
struct task_struct *thread;
struct wait_queue_head wait;
+
+ unsigned sq_thread_idle;
};
struct io_ring_ctx {
@@ -285,7 +287,6 @@ struct io_ring_ctx {
struct list_head timeout_list;
struct list_head cq_overflow_list;
- wait_queue_head_t inflight_wait;
struct io_uring_sqe *sq_sqes;
} ____cacheline_aligned_in_smp;
@@ -310,7 +311,6 @@ struct io_ring_ctx {
struct io_sq_data *sq_data; /* if using sq thread polling */
struct wait_queue_head sqo_sq_wait;
- struct wait_queue_entry sqo_wait_entry;
struct list_head sqd_list;
/*
@@ -395,16 +395,18 @@ struct io_ring_ctx {
*/
struct io_poll_iocb {
struct file *file;
- union {
- struct wait_queue_head *head;
- u64 addr;
- };
+ struct wait_queue_head *head;
__poll_t events;
bool done;
bool canceled;
struct wait_queue_entry wait;
};
+struct io_poll_remove {
+ struct file *file;
+ u64 addr;
+};
+
struct io_close {
struct file *file;
struct file *put_file;
@@ -444,11 +446,17 @@ struct io_timeout {
u32 off;
u32 target_seq;
struct list_head list;
+ /* head of the link, used by linked timeouts only */
+ struct io_kiocb *head;
};
struct io_timeout_rem {
struct file *file;
u64 addr;
+
+ /* timeout update */
+ struct timespec64 ts;
+ u32 flags;
};
struct io_rw {
@@ -541,6 +549,27 @@ struct io_statx {
struct statx __user *buffer;
};
+struct io_shutdown {
+ struct file *file;
+ int how;
+};
+
+struct io_rename {
+ struct file *file;
+ int old_dfd;
+ int new_dfd;
+ struct filename *oldpath;
+ struct filename *newpath;
+ int flags;
+};
+
+struct io_unlink {
+ struct file *file;
+ int dfd;
+ int flags;
+ struct filename *filename;
+};
+
struct io_completion {
struct file *file;
struct list_head list;
@@ -575,7 +604,6 @@ enum {
REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
- REQ_F_LINK_HEAD_BIT,
REQ_F_FAIL_LINK_BIT,
REQ_F_INFLIGHT_BIT,
REQ_F_CUR_POS_BIT,
@@ -607,8 +635,6 @@ enum {
/* IOSQE_BUFFER_SELECT */
REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),
- /* head of a link */
- REQ_F_LINK_HEAD = BIT(REQ_F_LINK_HEAD_BIT),
/* fail rest of links */
REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT),
/* on inflight list */
@@ -651,6 +677,7 @@ struct io_kiocb {
struct file *file;
struct io_rw rw;
struct io_poll_iocb poll;
+ struct io_poll_remove poll_remove;
struct io_accept accept;
struct io_sync sync;
struct io_cancel cancel;
@@ -667,6 +694,9 @@ struct io_kiocb {
struct io_splice splice;
struct io_provide_buf pbuf;
struct io_statx statx;
+ struct io_shutdown shutdown;
+ struct io_rename rename;
+ struct io_unlink unlink;
/* use only after cleaning per-op data, see io_clean_op() */
struct io_completion compl;
};
@@ -686,15 +716,14 @@ struct io_kiocb {
struct task_struct *task;
u64 user_data;
- struct list_head link_list;
+ struct io_kiocb *link;
+ struct percpu_ref *fixed_file_refs;
/*
* 1. used with ctx->iopoll_list with reads/writes
* 2. to track reqs with ->files (see io_op_def::file_table)
*/
struct list_head inflight_entry;
-
- struct percpu_ref *fixed_file_refs;
struct callback_head task_work;
/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
struct hlist_node hash_node;
@@ -725,6 +754,8 @@ struct io_submit_state {
void *reqs[IO_IOPOLL_BATCH];
unsigned int free_reqs;
+ bool plug_started;
+
/*
* Batch completion logic
*/
@@ -735,7 +766,7 @@ struct io_submit_state {
*/
struct file *file;
unsigned int fd;
- unsigned int has_refs;
+ unsigned int file_refs;
unsigned int ios_left;
};
@@ -757,6 +788,8 @@ struct io_op_def {
unsigned buffer_select : 1;
/* must always have async data allocated */
unsigned needs_async_data : 1;
+ /* should block plug */
+ unsigned plug : 1;
/* size of async data needed, if any */
unsigned short async_size;
unsigned work_flags;
@@ -770,6 +803,7 @@ static const struct io_op_def io_op_defs[] = {
.pollin = 1,
.buffer_select = 1,
.needs_async_data = 1,
+ .plug = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
@@ -779,6 +813,7 @@ static const struct io_op_def io_op_defs[] = {
.unbound_nonreg_file = 1,
.pollout = 1,
.needs_async_data = 1,
+ .plug = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
IO_WQ_WORK_FSIZE,
@@ -791,6 +826,7 @@ static const struct io_op_def io_op_defs[] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
+ .plug = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM,
},
@@ -799,6 +835,7 @@ static const struct io_op_def io_op_defs[] = {
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
+ .plug = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE |
IO_WQ_WORK_MM,
@@ -818,8 +855,7 @@ static const struct io_op_def io_op_defs[] = {
.pollout = 1,
.needs_async_data = 1,
.async_size = sizeof(struct io_async_msghdr),
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
- IO_WQ_WORK_FS,
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_RECVMSG] = {
.needs_file = 1,
@@ -828,15 +864,17 @@ static const struct io_op_def io_op_defs[] = {
.buffer_select = 1,
.needs_async_data = 1,
.async_size = sizeof(struct io_async_msghdr),
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
- IO_WQ_WORK_FS,
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_TIMEOUT] = {
.needs_async_data = 1,
.async_size = sizeof(struct io_timeout_data),
.work_flags = IO_WQ_WORK_MM,
},
- [IORING_OP_TIMEOUT_REMOVE] = {},
+ [IORING_OP_TIMEOUT_REMOVE] = {
+ /* used by timeout updates' prep() */
+ .work_flags = IO_WQ_WORK_MM,
+ },
[IORING_OP_ACCEPT] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
@@ -863,7 +901,7 @@ static const struct io_op_def io_op_defs[] = {
},
[IORING_OP_OPENAT] = {
.work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG |
- IO_WQ_WORK_FS,
+ IO_WQ_WORK_FS | IO_WQ_WORK_MM,
},
[IORING_OP_CLOSE] = {
.needs_file = 1,
@@ -882,6 +920,7 @@ static const struct io_op_def io_op_defs[] = {
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
+ .plug = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
@@ -889,6 +928,7 @@ static const struct io_op_def io_op_defs[] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
+ .plug = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
IO_WQ_WORK_FSIZE,
@@ -915,7 +955,7 @@ static const struct io_op_def io_op_defs[] = {
},
[IORING_OP_OPENAT2] = {
.work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_FS |
- IO_WQ_WORK_BLKCG,
+ IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM,
},
[IORING_OP_EPOLL_CTL] = {
.unbound_nonreg_file = 1,
@@ -934,6 +974,17 @@ static const struct io_op_def io_op_defs[] = {
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
},
+ [IORING_OP_SHUTDOWN] = {
+ .needs_file = 1,
+ },
+ [IORING_OP_RENAMEAT] = {
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES |
+ IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
+ },
+ [IORING_OP_UNLINKAT] = {
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES |
+ IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
+ },
};
enum io_mem_account {
@@ -983,6 +1034,9 @@ struct sock *io_uring_get_socket(struct file *file)
}
EXPORT_SYMBOL(io_uring_get_socket);
+#define io_for_each_link(pos, head) \
+ for (pos = (head); pos; pos = pos->link)
+
static inline void io_clean_op(struct io_kiocb *req)
{
if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED |
@@ -990,8 +1044,39 @@ static inline void io_clean_op(struct io_kiocb *req)
__io_clean_op(req);
}
-static void io_sq_thread_drop_mm(void)
+static inline void io_set_resource_node(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+
+ if (!req->fixed_file_refs) {
+ req->fixed_file_refs = &ctx->file_data->node->refs;
+ percpu_ref_get(req->fixed_file_refs);
+ }
+}
+
+static bool io_match_task(struct io_kiocb *head,
+ struct task_struct *task,
+ struct files_struct *files)
{
+ struct io_kiocb *req;
+
+ if (task && head->task != task)
+ return false;
+ if (!files)
+ return true;
+
+ io_for_each_link(req, head) {
+ if ((req->flags & REQ_F_WORK_INITIALIZED) &&
+ (req->work.flags & IO_WQ_WORK_FILES) &&
+ req->work.identity->files == files)
+ return true;
+ }
+ return false;
+}
+
+static void io_sq_thread_drop_mm_files(void)
+{
+ struct files_struct *files = current->files;
struct mm_struct *mm = current->mm;
if (mm) {
@@ -999,6 +1084,41 @@ static void io_sq_thread_drop_mm(void)
mmput(mm);
current->mm = NULL;
}
+ if (files) {
+ struct nsproxy *nsproxy = current->nsproxy;
+
+ task_lock(current);
+ current->files = NULL;
+ current->nsproxy = NULL;
+ task_unlock(current);
+ put_files_struct(files);
+ put_nsproxy(nsproxy);
+ }
+}
+
+static int __io_sq_thread_acquire_files(struct io_ring_ctx *ctx)
+{
+ if (!current->files) {
+ struct files_struct *files;
+ struct nsproxy *nsproxy;
+
+ task_lock(ctx->sqo_task);
+ files = ctx->sqo_task->files;
+ if (!files) {
+ task_unlock(ctx->sqo_task);
+ return -EOWNERDEAD;
+ }
+ atomic_inc(&files->count);
+ get_nsproxy(ctx->sqo_task->nsproxy);
+ nsproxy = ctx->sqo_task->nsproxy;
+ task_unlock(ctx->sqo_task);
+
+ task_lock(current);
+ current->files = files;
+ current->nsproxy = nsproxy;
+ task_unlock(current);
+ }
+ return 0;
}
static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
@@ -1026,12 +1146,25 @@ static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
return -EFAULT;
}
-static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
- struct io_kiocb *req)
+static int io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx,
+ struct io_kiocb *req)
{
- if (!(io_op_defs[req->opcode].work_flags & IO_WQ_WORK_MM))
- return 0;
- return __io_sq_thread_acquire_mm(ctx);
+ const struct io_op_def *def = &io_op_defs[req->opcode];
+ int ret;
+
+ if (def->work_flags & IO_WQ_WORK_MM) {
+ ret = __io_sq_thread_acquire_mm(ctx);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ if (def->needs_file || (def->work_flags & IO_WQ_WORK_FILES)) {
+ ret = __io_sq_thread_acquire_files(ctx);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ return 0;
}
static void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx,
@@ -1174,7 +1307,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->iopoll_list);
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
- init_waitqueue_head(&ctx->inflight_wait);
spin_lock_init(&ctx->inflight_lock);
INIT_LIST_HEAD(&ctx->inflight_list);
INIT_DELAYED_WORK(&ctx->file_put_work, io_file_put_work);
@@ -1416,10 +1548,8 @@ static void io_prep_async_link(struct io_kiocb *req)
{
struct io_kiocb *cur;
- io_prep_async_work(req);
- if (req->flags & REQ_F_LINK_HEAD)
- list_for_each_entry(cur, &req->link_list, link_list)
- io_prep_async_work(cur);
+ io_for_each_link(cur, req)
+ io_prep_async_work(cur);
}
static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req)
@@ -1460,30 +1590,18 @@ static void io_kill_timeout(struct io_kiocb *req)
}
}
-static bool io_task_match(struct io_kiocb *req, struct task_struct *tsk)
-{
- struct io_ring_ctx *ctx = req->ctx;
-
- if (!tsk || req->task == tsk)
- return true;
- if (ctx->flags & IORING_SETUP_SQPOLL) {
- if (ctx->sq_data && req->task == ctx->sq_data->thread)
- return true;
- }
- return false;
-}
-
/*
* Returns true if we found and killed one or more timeouts
*/
-static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk)
+static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
+ struct files_struct *files)
{
struct io_kiocb *req, *tmp;
int canceled = 0;
spin_lock_irq(&ctx->completion_lock);
list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
- if (io_task_match(req, tsk)) {
+ if (io_match_task(req, tsk, files)) {
io_kill_timeout(req);
canceled++;
}
@@ -1594,32 +1712,6 @@ static void io_cqring_mark_overflow(struct io_ring_ctx *ctx)
}
}
-static inline bool __io_match_files(struct io_kiocb *req,
- struct files_struct *files)
-{
- return ((req->flags & REQ_F_WORK_INITIALIZED) &&
- (req->work.flags & IO_WQ_WORK_FILES)) &&
- req->work.identity->files == files;
-}
-
-static bool io_match_files(struct io_kiocb *req,
- struct files_struct *files)
-{
- struct io_kiocb *link;
-
- if (!files)
- return true;
- if (__io_match_files(req, files))
- return true;
- if (req->flags & REQ_F_LINK_HEAD) {
- list_for_each_entry(link, &req->link_list, link_list) {
- if (__io_match_files(link, files))
- return true;
- }
- }
- return false;
-}
-
/* Returns true if there are no backlogged entries after the flush */
static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
struct task_struct *tsk,
@@ -1647,9 +1739,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
cqe = NULL;
list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) {
- if (tsk && req->task != tsk)
- continue;
- if (!io_match_files(req, files))
+ if (!io_match_task(req, tsk, files))
continue;
cqe = io_get_cqring(ctx);
@@ -1845,9 +1935,7 @@ fallback:
static inline void io_put_file(struct io_kiocb *req, struct file *file,
bool fixed)
{
- if (fixed)
- percpu_ref_put(req->fixed_file_refs);
- else
+ if (!fixed)
fput(file);
}
@@ -1859,7 +1947,8 @@ static void io_dismantle_req(struct io_kiocb *req)
kfree(req->async_data);
if (req->file)
io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
-
+ if (req->fixed_file_refs)
+ percpu_ref_put(req->fixed_file_refs);
io_req_clean_work(req);
}
@@ -1882,6 +1971,14 @@ static void __io_free_req(struct io_kiocb *req)
percpu_ref_put(&ctx->refs);
}
+static inline void io_remove_next_linked(struct io_kiocb *req)
+{
+ struct io_kiocb *nxt = req->link;
+
+ req->link = nxt->link;
+ nxt->link = NULL;
+}
+
static void io_kill_linked_timeout(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -1890,8 +1987,8 @@ static void io_kill_linked_timeout(struct io_kiocb *req)
unsigned long flags;
spin_lock_irqsave(&ctx->completion_lock, flags);
- link = list_first_entry_or_null(&req->link_list, struct io_kiocb,
- link_list);
+ link = req->link;
+
/*
* Can happen if a linked timeout fired and link had been like
* req -> link t-out -> link t-out [-> ...]
@@ -1900,7 +1997,8 @@ static void io_kill_linked_timeout(struct io_kiocb *req)
struct io_timeout_data *io = link->async_data;
int ret;
- list_del_init(&link->link_list);
+ io_remove_next_linked(req);
+ link->timeout.head = NULL;
ret = hrtimer_try_to_cancel(&io->timer);
if (ret != -1) {
io_cqring_fill_event(link, -ECANCELED);
@@ -1917,41 +2015,22 @@ static void io_kill_linked_timeout(struct io_kiocb *req)
}
}
-static struct io_kiocb *io_req_link_next(struct io_kiocb *req)
-{
- struct io_kiocb *nxt;
- /*
- * The list should never be empty when we are called here. But could
- * potentially happen if the chain is messed up, check to be on the
- * safe side.
- */
- if (unlikely(list_empty(&req->link_list)))
- return NULL;
-
- nxt = list_first_entry(&req->link_list, struct io_kiocb, link_list);
- list_del_init(&req->link_list);
- if (!list_empty(&nxt->link_list))
- nxt->flags |= REQ_F_LINK_HEAD;
- return nxt;
-}
-
-/*
- * Called if REQ_F_LINK_HEAD is set, and we fail the head request
- */
static void io_fail_links(struct io_kiocb *req)
{
+ struct io_kiocb *link, *nxt;
struct io_ring_ctx *ctx = req->ctx;
unsigned long flags;
spin_lock_irqsave(&ctx->completion_lock, flags);
- while (!list_empty(&req->link_list)) {
- struct io_kiocb *link = list_first_entry(&req->link_list,
- struct io_kiocb, link_list);
+ link = req->link;
+ req->link = NULL;
- list_del_init(&link->link_list);
- trace_io_uring_fail_link(req, link);
+ while (link) {
+ nxt = link->link;
+ link->link = NULL;
+ trace_io_uring_fail_link(req, link);
io_cqring_fill_event(link, -ECANCELED);
/*
@@ -1963,8 +2042,8 @@ static void io_fail_links(struct io_kiocb *req)
io_put_req_deferred(link, 2);
else
io_double_put_req(link);
+ link = nxt;
}
-
io_commit_cqring(ctx);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
@@ -1973,7 +2052,6 @@ static void io_fail_links(struct io_kiocb *req)
static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
{
- req->flags &= ~REQ_F_LINK_HEAD;
if (req->flags & REQ_F_LINK_TIMEOUT)
io_kill_linked_timeout(req);
@@ -1983,15 +2061,19 @@ static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
* dependencies to the next request. In case of failure, fail the rest
* of the chain.
*/
- if (likely(!(req->flags & REQ_F_FAIL_LINK)))
- return io_req_link_next(req);
+ if (likely(!(req->flags & REQ_F_FAIL_LINK))) {
+ struct io_kiocb *nxt = req->link;
+
+ req->link = NULL;
+ return nxt;
+ }
io_fail_links(req);
return NULL;
}
-static struct io_kiocb *io_req_find_next(struct io_kiocb *req)
+static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
{
- if (likely(!(req->flags & REQ_F_LINK_HEAD)))
+ if (likely(!(req->link) && !(req->flags & REQ_F_LINK_TIMEOUT)))
return NULL;
return __io_req_find_next(req);
}
@@ -2050,7 +2132,8 @@ static void __io_req_task_submit(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- if (!__io_sq_thread_acquire_mm(ctx)) {
+ if (!__io_sq_thread_acquire_mm(ctx) &&
+ !__io_sq_thread_acquire_files(ctx)) {
mutex_lock(&ctx->uring_lock);
__io_queue_sqe(req, NULL);
mutex_unlock(&ctx->uring_lock);
@@ -2086,7 +2169,7 @@ static void io_req_task_queue(struct io_kiocb *req)
}
}
-static void io_queue_next(struct io_kiocb *req)
+static inline void io_queue_next(struct io_kiocb *req)
{
struct io_kiocb *nxt = io_req_find_next(req);
@@ -2143,8 +2226,7 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req)
io_free_req(req);
return;
}
- if (req->flags & REQ_F_LINK_HEAD)
- io_queue_next(req);
+ io_queue_next(req);
if (req->task != rb->task) {
if (rb->task) {
@@ -2246,7 +2328,7 @@ static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
* we wake up the task, and the next invocation will flush the
* entries. We cannot safely to it from here.
*/
- if (noflush && !list_empty(&ctx->cq_overflow_list))
+ if (noflush)
return -1U;
io_cqring_overflow_flush(ctx, false, NULL, NULL);
@@ -2593,7 +2675,7 @@ static bool io_rw_reissue(struct io_kiocb *req, long res)
if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker())
return false;
- ret = io_sq_thread_acquire_mm(req->ctx, req);
+ ret = io_sq_thread_acquire_mm_files(req->ctx, req);
if (io_resubmit_prep(req, ret)) {
refcount_inc(&req->refs);
@@ -2641,7 +2723,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
* find it from a io_iopoll_getevents() thread before the issuer is done
* accessing the kiocb cookie.
*/
-static void io_iopoll_req_issued(struct io_kiocb *req)
+static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -2670,21 +2752,25 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
else
list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
- if ((ctx->flags & IORING_SETUP_SQPOLL) &&
+ /*
+ * If IORING_SETUP_SQPOLL is enabled, sqes are either handled in sq thread
+ * task context or in io worker task context. If current task context is
+ * sq thread, we don't need to check whether should wake up sq thread.
+ */
+ if (in_async && (ctx->flags & IORING_SETUP_SQPOLL) &&
wq_has_sleeper(&ctx->sq_data->wait))
wake_up(&ctx->sq_data->wait);
}
-static void __io_state_file_put(struct io_submit_state *state)
+static inline void __io_state_file_put(struct io_submit_state *state)
{
- if (state->has_refs)
- fput_many(state->file, state->has_refs);
- state->file = NULL;
+ fput_many(state->file, state->file_refs);
+ state->file_refs = 0;
}
static inline void io_state_file_put(struct io_submit_state *state)
{
- if (state->file)
+ if (state->file_refs)
__io_state_file_put(state);
}
@@ -2698,19 +2784,19 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd)
if (!state)
return fget(fd);
- if (state->file) {
+ if (state->file_refs) {
if (state->fd == fd) {
- state->has_refs--;
+ state->file_refs--;
return state->file;
}
__io_state_file_put(state);
}
state->file = fget_many(fd, state->ios_left);
- if (!state->file)
+ if (unlikely(!state->file))
return NULL;
state->fd = fd;
- state->has_refs = state->ios_left - 1;
+ state->file_refs = state->ios_left - 1;
return state->file;
}
@@ -3065,7 +3151,7 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
return __io_iov_buffer_select(req, iov, needs_lock);
}
-static ssize_t __io_import_iovec(int rw, struct io_kiocb *req,
+static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
struct iovec **iovec, struct iov_iter *iter,
bool needs_lock)
{
@@ -3094,7 +3180,7 @@ static ssize_t __io_import_iovec(int rw, struct io_kiocb *req,
ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
*iovec = NULL;
- return ret < 0 ? ret : sqe_len;
+ return ret;
}
if (req->flags & REQ_F_BUFFER_SELECT) {
@@ -3111,18 +3197,6 @@ static ssize_t __io_import_iovec(int rw, struct io_kiocb *req,
req->ctx->compat);
}
-static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
- struct iovec **iovec, struct iov_iter *iter,
- bool needs_lock)
-{
- struct io_async_rw *iorw = req->async_data;
-
- if (!iorw)
- return __io_import_iovec(rw, req, iovec, iter, needs_lock);
- *iovec = NULL;
- return iov_iter_count(&iorw->iter);
-}
-
static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
{
return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
@@ -3246,7 +3320,7 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
struct iovec *iov = iorw->fast_iov;
ssize_t ret;
- ret = __io_import_iovec(rw, req, &iov, &iorw->iter, false);
+ ret = io_import_iovec(rw, req, &iov, &iorw->iter, false);
if (unlikely(ret < 0))
return ret;
@@ -3379,17 +3453,17 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
struct iov_iter __iter, *iter = &__iter;
struct io_async_rw *rw = req->async_data;
ssize_t io_size, ret, ret2;
- size_t iov_count;
bool no_async;
- if (rw)
+ if (rw) {
iter = &rw->iter;
-
- ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
- if (ret < 0)
- return ret;
- iov_count = iov_iter_count(iter);
- io_size = ret;
+ iovec = NULL;
+ } else {
+ ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
+ if (ret < 0)
+ return ret;
+ }
+ io_size = iov_iter_count(iter);
req->result = io_size;
ret = 0;
@@ -3405,7 +3479,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
if (no_async)
goto copy_iov;
- ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), iov_count);
+ ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size);
if (unlikely(ret))
goto out_free;
@@ -3424,7 +3498,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
if (req->file->f_flags & O_NONBLOCK)
goto done;
/* some cases will consume bytes even on error returns */
- iov_iter_revert(iter, iov_count - iov_iter_count(iter));
+ iov_iter_revert(iter, io_size - iov_iter_count(iter));
ret = 0;
goto copy_iov;
} else if (ret < 0) {
@@ -3507,17 +3581,17 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
struct kiocb *kiocb = &req->rw.kiocb;
struct iov_iter __iter, *iter = &__iter;
struct io_async_rw *rw = req->async_data;
- size_t iov_count;
ssize_t ret, ret2, io_size;
- if (rw)
+ if (rw) {
iter = &rw->iter;
-
- ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
- if (ret < 0)
- return ret;
- iov_count = iov_iter_count(iter);
- io_size = ret;
+ iovec = NULL;
+ } else {
+ ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
+ if (ret < 0)
+ return ret;
+ }
+ io_size = iov_iter_count(iter);
req->result = io_size;
/* Ensure we clear previously set non-block flag */
@@ -3535,7 +3609,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
(req->flags & REQ_F_ISREG))
goto copy_iov;
- ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), iov_count);
+ ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), io_size);
if (unlikely(ret))
goto out_free;
@@ -3578,7 +3652,7 @@ done:
} else {
copy_iov:
/* some cases will consume bytes even on error returns */
- iov_iter_revert(iter, iov_count - iov_iter_count(iter));
+ iov_iter_revert(iter, io_size - iov_iter_count(iter));
ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
if (!ret)
return -EAGAIN;
@@ -3590,6 +3664,135 @@ out_free:
return ret;
}
+static int io_renameat_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ struct io_rename *ren = &req->rename;
+ const char __user *oldf, *newf;
+
+ if (unlikely(req->flags & REQ_F_FIXED_FILE))
+ return -EBADF;
+
+ ren->old_dfd = READ_ONCE(sqe->fd);
+ oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+ ren->new_dfd = READ_ONCE(sqe->len);
+ ren->flags = READ_ONCE(sqe->rename_flags);
+
+ ren->oldpath = getname(oldf);
+ if (IS_ERR(ren->oldpath))
+ return PTR_ERR(ren->oldpath);
+
+ ren->newpath = getname(newf);
+ if (IS_ERR(ren->newpath)) {
+ putname(ren->oldpath);
+ return PTR_ERR(ren->newpath);
+ }
+
+ req->flags |= REQ_F_NEED_CLEANUP;
+ return 0;
+}
+
+static int io_renameat(struct io_kiocb *req, bool force_nonblock)
+{
+ struct io_rename *ren = &req->rename;
+ int ret;
+
+ if (force_nonblock)
+ return -EAGAIN;
+
+ ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
+ ren->newpath, ren->flags);
+
+ req->flags &= ~REQ_F_NEED_CLEANUP;
+ if (ret < 0)
+ req_set_fail_links(req);
+ io_req_complete(req, ret);
+ return 0;
+}
+
+static int io_unlinkat_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ struct io_unlink *un = &req->unlink;
+ const char __user *fname;
+
+ if (unlikely(req->flags & REQ_F_FIXED_FILE))
+ return -EBADF;
+
+ un->dfd = READ_ONCE(sqe->fd);
+
+ un->flags = READ_ONCE(sqe->unlink_flags);
+ if (un->flags & ~AT_REMOVEDIR)
+ return -EINVAL;
+
+ fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ un->filename = getname(fname);
+ if (IS_ERR(un->filename))
+ return PTR_ERR(un->filename);
+
+ req->flags |= REQ_F_NEED_CLEANUP;
+ return 0;
+}
+
+static int io_unlinkat(struct io_kiocb *req, bool force_nonblock)
+{
+ struct io_unlink *un = &req->unlink;
+ int ret;
+
+ if (force_nonblock)
+ return -EAGAIN;
+
+ if (un->flags & AT_REMOVEDIR)
+ ret = do_rmdir(un->dfd, un->filename);
+ else
+ ret = do_unlinkat(un->dfd, un->filename);
+
+ req->flags &= ~REQ_F_NEED_CLEANUP;
+ if (ret < 0)
+ req_set_fail_links(req);
+ io_req_complete(req, ret);
+ return 0;
+}
+
+static int io_shutdown_prep(struct io_kiocb *req,
<