-rw-r--r--  fs/internal.h                  |    2
-rw-r--r--  fs/io-wq.c                     |   10
-rw-r--r--  fs/io-wq.h                     |    1
-rw-r--r--  fs/io_uring.c                  | 1327
-rw-r--r--  fs/namei.c                     |   40
-rw-r--r--  include/linux/socket.h         |    1
-rw-r--r--  include/linux/syscalls.h       |    2
-rw-r--r--  include/uapi/linux/io_uring.h  |   16
-rw-r--r--  net/socket.c                   |   15
9 files changed, 824 insertions(+), 590 deletions(-)
diff --git a/fs/internal.h b/fs/internal.h
index a7cd0f64faa4..6fd14ea213c3 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -78,6 +78,8 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
long do_rmdir(int dfd, struct filename *name);
long do_unlinkat(int dfd, struct filename *name);
int may_linkat(struct path *link);
+int do_renameat2(int olddfd, struct filename *oldname, int newdfd,
+ struct filename *newname, unsigned int flags);
/*
* namespace.c
diff --git a/fs/io-wq.c b/fs/io-wq.c
index b53c055bea6a..f72d53848dcb 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -1078,16 +1078,6 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
return IO_WQ_CANCEL_NOTFOUND;
}
-static bool io_wq_io_cb_cancel_data(struct io_wq_work *work, void *data)
-{
- return work == data;
-}
-
-enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
-{
- return io_wq_cancel_cb(wq, io_wq_io_cb_cancel_data, (void *)cwork, false);
-}
-
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
{
int ret = -ENOMEM, node;
diff --git a/fs/io-wq.h b/fs/io-wq.h
index cba36f03c355..069496c6d4f9 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -129,7 +129,6 @@ static inline bool io_wq_is_hashed(struct io_wq_work *work)
}
void io_wq_cancel_all(struct io_wq *wq);
-enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork);
typedef bool (work_cancel_fn)(struct io_wq_work *, void *);
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 0e8902be6b96..22e31050f33e 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -245,6 +245,8 @@ struct io_sq_data {
struct task_struct *thread;
struct wait_queue_head wait;
+
+ unsigned sq_thread_idle;
};
struct io_ring_ctx {
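The new sq_thread_idle in io_sq_data moves the idle timeout into the shared SQPOLL data, so rings attached to the same sq thread agree on when it may go to sleep. For reference, a minimal userspace sketch of requesting SQPOLL with an idle timeout via raw io_uring_setup(2); the uring_setup() wrapper below is illustrative, not part of this patch:

    #include <linux/io_uring.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* io_uring_setup(2) has no libc wrapper; this one is illustrative */
    static int uring_setup(unsigned entries, struct io_uring_params *p)
    {
            return (int) syscall(__NR_io_uring_setup, entries, p);
    }

    int make_sqpoll_ring(void)
    {
            struct io_uring_params p;

            memset(&p, 0, sizeof(p));
            p.flags = IORING_SETUP_SQPOLL;
            p.sq_thread_idle = 2000;        /* ms before the sq thread idles */
            return uring_setup(8, &p);      /* ring fd, or -1 on error */
    }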
@@ -285,7 +287,6 @@ struct io_ring_ctx {
struct list_head timeout_list;
struct list_head cq_overflow_list;
- wait_queue_head_t inflight_wait;
struct io_uring_sqe *sq_sqes;
} ____cacheline_aligned_in_smp;
@@ -310,7 +311,6 @@ struct io_ring_ctx {
struct io_sq_data *sq_data; /* if using sq thread polling */
struct wait_queue_head sqo_sq_wait;
- struct wait_queue_entry sqo_wait_entry;
struct list_head sqd_list;
/*
@@ -395,16 +395,18 @@ struct io_ring_ctx {
*/
struct io_poll_iocb {
struct file *file;
- union {
- struct wait_queue_head *head;
- u64 addr;
- };
+ struct wait_queue_head *head;
__poll_t events;
bool done;
bool canceled;
struct wait_queue_entry wait;
};
+struct io_poll_remove {
+ struct file *file;
+ u64 addr;
+};
+
struct io_close {
struct file *file;
struct file *put_file;
@@ -444,11 +446,17 @@ struct io_timeout {
u32 off;
u32 target_seq;
struct list_head list;
+ /* head of the link, used by linked timeouts only */
+ struct io_kiocb *head;
};
struct io_timeout_rem {
struct file *file;
u64 addr;
+
+ /* timeout update */
+ struct timespec64 ts;
+ u32 flags;
};
struct io_rw {
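struct io_timeout_rem growing a timespec64 and flags is what lets IORING_OP_TIMEOUT_REMOVE double as a timeout update. A hedged liburing sketch, assuming the io_uring_prep_timeout_update() helper that ships with the liburing tracking this kernel change:

    #include <errno.h>
    #include <liburing.h>

    /* re-arm a pending timeout, matched by the user_data it was queued with */
    static int update_timeout(struct io_uring *ring, __u64 tag)
    {
            struct __kernel_timespec ts = { .tv_sec = 5 };
            struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

            if (!sqe)
                    return -EBUSY;  /* SQ full; illustrative error choice */
            io_uring_prep_timeout_update(sqe, &ts, tag, 0);
            return io_uring_submit(ring);
    }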
@@ -541,6 +549,27 @@ struct io_statx {
struct statx __user *buffer;
};
+struct io_shutdown {
+ struct file *file;
+ int how;
+};
+
+struct io_rename {
+ struct file *file;
+ int old_dfd;
+ int new_dfd;
+ struct filename *oldpath;
+ struct filename *newpath;
+ int flags;
+};
+
+struct io_unlink {
+ struct file *file;
+ int dfd;
+ int flags;
+ struct filename *filename;
+};
+
struct io_completion {
struct file *file;
struct list_head list;
@@ -575,7 +604,6 @@ enum {
REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
- REQ_F_LINK_HEAD_BIT,
REQ_F_FAIL_LINK_BIT,
REQ_F_INFLIGHT_BIT,
REQ_F_CUR_POS_BIT,
@@ -607,8 +635,6 @@ enum {
/* IOSQE_BUFFER_SELECT */
REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),
- /* head of a link */
- REQ_F_LINK_HEAD = BIT(REQ_F_LINK_HEAD_BIT),
/* fail rest of links */
REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT),
/* on inflight list */
@@ -651,6 +677,7 @@ struct io_kiocb {
struct file *file;
struct io_rw rw;
struct io_poll_iocb poll;
+ struct io_poll_remove poll_remove;
struct io_accept accept;
struct io_sync sync;
struct io_cancel cancel;
@@ -667,6 +694,9 @@ struct io_kiocb {
struct io_splice splice;
struct io_provide_buf pbuf;
struct io_statx statx;
+ struct io_shutdown shutdown;
+ struct io_rename rename;
+ struct io_unlink unlink;
/* use only after cleaning per-op data, see io_clean_op() */
struct io_completion compl;
};
@@ -686,15 +716,14 @@ struct io_kiocb {
struct task_struct *task;
u64 user_data;
- struct list_head link_list;
+ struct io_kiocb *link;
+ struct percpu_ref *fixed_file_refs;
/*
* 1. used with ctx->iopoll_list with reads/writes
* 2. to track reqs with ->files (see io_op_def::file_table)
*/
struct list_head inflight_entry;
-
- struct percpu_ref *fixed_file_refs;
struct callback_head task_work;
/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
struct hlist_node hash_node;
@@ -725,6 +754,8 @@ struct io_submit_state {
void *reqs[IO_IOPOLL_BATCH];
unsigned int free_reqs;
+ bool plug_started;
+
/*
* Batch completion logic
*/
@@ -735,7 +766,7 @@ struct io_submit_state {
*/
struct file *file;
unsigned int fd;
- unsigned int has_refs;
+ unsigned int file_refs;
unsigned int ios_left;
};
@@ -757,6 +788,8 @@ struct io_op_def {
unsigned buffer_select : 1;
/* must always have async data allocated */
unsigned needs_async_data : 1;
+ /* should block plug */
+ unsigned plug : 1;
/* size of async data needed, if any */
unsigned short async_size;
unsigned work_flags;
@@ -770,6 +803,7 @@ static const struct io_op_def io_op_defs[] = {
.pollin = 1,
.buffer_select = 1,
.needs_async_data = 1,
+ .plug = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
@@ -779,6 +813,7 @@ static const struct io_op_def io_op_defs[] = {
.unbound_nonreg_file = 1,
.pollout = 1,
.needs_async_data = 1,
+ .plug = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
IO_WQ_WORK_FSIZE,
@@ -791,6 +826,7 @@ static const struct io_op_def io_op_defs[] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
+ .plug = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM,
},
@@ -799,6 +835,7 @@ static const struct io_op_def io_op_defs[] = {
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
+ .plug = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE |
IO_WQ_WORK_MM,
@@ -818,8 +855,7 @@ static const struct io_op_def io_op_defs[] = {
.pollout = 1,
.needs_async_data = 1,
.async_size = sizeof(struct io_async_msghdr),
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
- IO_WQ_WORK_FS,
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_RECVMSG] = {
.needs_file = 1,
@@ -828,15 +864,17 @@ static const struct io_op_def io_op_defs[] = {
.buffer_select = 1,
.needs_async_data = 1,
.async_size = sizeof(struct io_async_msghdr),
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
- IO_WQ_WORK_FS,
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_TIMEOUT] = {
.needs_async_data = 1,
.async_size = sizeof(struct io_timeout_data),
.work_flags = IO_WQ_WORK_MM,
},
- [IORING_OP_TIMEOUT_REMOVE] = {},
+ [IORING_OP_TIMEOUT_REMOVE] = {
+ /* used by timeout updates' prep() */
+ .work_flags = IO_WQ_WORK_MM,
+ },
[IORING_OP_ACCEPT] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
@@ -863,7 +901,7 @@ static const struct io_op_def io_op_defs[] = {
},
[IORING_OP_OPENAT] = {
.work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG |
- IO_WQ_WORK_FS,
+ IO_WQ_WORK_FS | IO_WQ_WORK_MM,
},
[IORING_OP_CLOSE] = {
.needs_file = 1,
@@ -882,6 +920,7 @@ static const struct io_op_def io_op_defs[] = {
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
+ .plug = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
@@ -889,6 +928,7 @@ static const struct io_op_def io_op_defs[] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
+ .plug = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
IO_WQ_WORK_FSIZE,
@@ -915,7 +955,7 @@ static const struct io_op_def io_op_defs[] = {
},
[IORING_OP_OPENAT2] = {
.work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_FS |
- IO_WQ_WORK_BLKCG,
+ IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM,
},
[IORING_OP_EPOLL_CTL] = {
.unbound_nonreg_file = 1,
@@ -934,6 +974,17 @@ static const struct io_op_def io_op_defs[] = {
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
},
+ [IORING_OP_SHUTDOWN] = {
+ .needs_file = 1,
+ },
+ [IORING_OP_RENAMEAT] = {
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES |
+ IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
+ },
+ [IORING_OP_UNLINKAT] = {
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES |
+ IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
+ },
};
enum io_mem_account {
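Since io_op_defs[] only gains entries on kernels that know the opcodes, userspace should probe before relying on IORING_OP_SHUTDOWN, IORING_OP_RENAMEAT or IORING_OP_UNLINKAT. A sketch using liburing's probe API (hedged: older liburing releases lack the newer opcode constants):

    #include <liburing.h>
    #include <stdio.h>

    static void check_new_ops(void)
    {
            struct io_uring_probe *probe = io_uring_get_probe();

            if (!probe) {
                    puts("probe unsupported (pre-5.6 kernel?)");
                    return;
            }
            printf("RENAMEAT=%d UNLINKAT=%d SHUTDOWN=%d\n",
                   io_uring_opcode_supported(probe, IORING_OP_RENAMEAT),
                   io_uring_opcode_supported(probe, IORING_OP_UNLINKAT),
                   io_uring_opcode_supported(probe, IORING_OP_SHUTDOWN));
            io_uring_free_probe(probe);
    }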
@@ -983,6 +1034,9 @@ struct sock *io_uring_get_socket(struct file *file)
}
EXPORT_SYMBOL(io_uring_get_socket);
+#define io_for_each_link(pos, head) \
+ for (pos = (head); pos; pos = pos->link)
+
static inline void io_clean_op(struct io_kiocb *req)
{
if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED |
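io_for_each_link replaces the old list_head walk now that a request chain hangs off a singly linked ->link pointer; note it includes the head itself. An illustrative kernel-style use (not part of the patch):

    /* count the requests in a link chain, head included */
    static unsigned int chain_len(struct io_kiocb *head)
    {
            struct io_kiocb *req;
            unsigned int n = 0;

            io_for_each_link(req, head)
                    n++;
            return n;
    }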
@@ -990,8 +1044,39 @@ static inline void io_clean_op(struct io_kiocb *req)
__io_clean_op(req);
}
-static void io_sq_thread_drop_mm(void)
+static inline void io_set_resource_node(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+
+ if (!req->fixed_file_refs) {
+ req->fixed_file_refs = &ctx->file_data->node->refs;
+ percpu_ref_get(req->fixed_file_refs);
+ }
+}
+
+static bool io_match_task(struct io_kiocb *head,
+ struct task_struct *task,
+ struct files_struct *files)
{
+ struct io_kiocb *req;
+
+ if (task && head->task != task)
+ return false;
+ if (!files)
+ return true;
+
+ io_for_each_link(req, head) {
+ if ((req->flags & REQ_F_WORK_INITIALIZED) &&
+ (req->work.flags & IO_WQ_WORK_FILES) &&
+ req->work.identity->files == files)
+ return true;
+ }
+ return false;
+}
+
+static void io_sq_thread_drop_mm_files(void)
+{
+ struct files_struct *files = current->files;
struct mm_struct *mm = current->mm;
if (mm) {
@@ -999,6 +1084,41 @@ static void io_sq_thread_drop_mm(void)
mmput(mm);
current->mm = NULL;
}
+ if (files) {
+ struct nsproxy *nsproxy = current->nsproxy;
+
+ task_lock(current);
+ current->files = NULL;
+ current->nsproxy = NULL;
+ task_unlock(current);
+ put_files_struct(files);
+ put_nsproxy(nsproxy);
+ }
+}
+
+static int __io_sq_thread_acquire_files(struct io_ring_ctx *ctx)
+{
+ if (!current->files) {
+ struct files_struct *files;
+ struct nsproxy *nsproxy;
+
+ task_lock(ctx->sqo_task);
+ files = ctx->sqo_task->files;
+ if (!files) {
+ task_unlock(ctx->sqo_task);
+ return -EOWNERDEAD;
+ }
+ atomic_inc(&files->count);
+ get_nsproxy(ctx->sqo_task->nsproxy);
+ nsproxy = ctx->sqo_task->nsproxy;
+ task_unlock(ctx->sqo_task);
+
+ task_lock(current);
+ current->files = files;
+ current->nsproxy = nsproxy;
+ task_unlock(current);
+ }
+ return 0;
}
static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
@@ -1026,12 +1146,25 @@ static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
return -EFAULT;
}
-static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
- struct io_kiocb *req)
+static int io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx,
+ struct io_kiocb *req)
{
- if (!(io_op_defs[req->opcode].work_flags & IO_WQ_WORK_MM))
- return 0;
- return __io_sq_thread_acquire_mm(ctx);
+ const struct io_op_def *def = &io_op_defs[req->opcode];
+ int ret;
+
+ if (def->work_flags & IO_WQ_WORK_MM) {
+ ret = __io_sq_thread_acquire_mm(ctx);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ if (def->needs_file || (def->work_flags & IO_WQ_WORK_FILES)) {
+ ret = __io_sq_thread_acquire_files(ctx);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ return 0;
}
static void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx,
@@ -1174,7 +1307,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->iopoll_list);
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
- init_waitqueue_head(&ctx->inflight_wait);
spin_lock_init(&ctx->inflight_lock);
INIT_LIST_HEAD(&ctx->inflight_list);
INIT_DELAYED_WORK(&ctx->file_put_work, io_file_put_work);
@@ -1416,10 +1548,8 @@ static void io_prep_async_link(struct io_kiocb *req)
{
struct io_kiocb *cur;
- io_prep_async_work(req);
- if (req->flags & REQ_F_LINK_HEAD)
- list_for_each_entry(cur, &req->link_list, link_list)
- io_prep_async_work(cur);
+ io_for_each_link(cur, req)
+ io_prep_async_work(cur);
}
static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req)
@@ -1460,30 +1590,18 @@ static void io_kill_timeout(struct io_kiocb *req)
}
}
-static bool io_task_match(struct io_kiocb *req, struct task_struct *tsk)
-{
- struct io_ring_ctx *ctx = req->ctx;
-
- if (!tsk || req->task == tsk)
- return true;
- if (ctx->flags & IORING_SETUP_SQPOLL) {
- if (ctx->sq_data && req->task == ctx->sq_data->thread)
- return true;
- }
- return false;
-}
-
/*
* Returns true if we found and killed one or more timeouts
*/
-static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk)
+static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
+ struct files_struct *files)
{
struct io_kiocb *req, *tmp;
int canceled = 0;
spin_lock_irq(&ctx->completion_lock);
list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
- if (io_task_match(req, tsk)) {
+ if (io_match_task(req, tsk, files)) {
io_kill_timeout(req);
canceled++;
}
@@ -1594,32 +1712,6 @@ static void io_cqring_mark_overflow(struct io_ring_ctx *ctx)
}
}
-static inline bool __io_match_files(struct io_kiocb *req,
- struct files_struct *files)
-{
- return ((req->flags & REQ_F_WORK_INITIALIZED) &&
- (req->work.flags & IO_WQ_WORK_FILES)) &&
- req->work.identity->files == files;
-}
-
-static bool io_match_files(struct io_kiocb *req,
- struct files_struct *files)
-{
- struct io_kiocb *link;
-
- if (!files)
- return true;
- if (__io_match_files(req, files))
- return true;
- if (req->flags & REQ_F_LINK_HEAD) {
- list_for_each_entry(link, &req->link_list, link_list) {
- if (__io_match_files(link, files))
- return true;
- }
- }
- return false;
-}
-
/* Returns true if there are no backlogged entries after the flush */
static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
struct task_struct *tsk,
@@ -1647,9 +1739,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
cqe = NULL;
list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) {
- if (tsk && req->task != tsk)
- continue;
- if (!io_match_files(req, files))
+ if (!io_match_task(req, tsk, files))
continue;
cqe = io_get_cqring(ctx);
@@ -1845,9 +1935,7 @@ fallback:
static inline void io_put_file(struct io_kiocb *req, struct file *file,
bool fixed)
{
- if (fixed)
- percpu_ref_put(req->fixed_file_refs);
- else
+ if (!fixed)
fput(file);
}
@@ -1859,7 +1947,8 @@ static void io_dismantle_req(struct io_kiocb *req)
kfree(req->async_data);
if (req->file)
io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
-
+ if (req->fixed_file_refs)
+ percpu_ref_put(req->fixed_file_refs);
io_req_clean_work(req);
}
@@ -1882,6 +1971,14 @@ static void __io_free_req(struct io_kiocb *req)
percpu_ref_put(&ctx->refs);
}
+static inline void io_remove_next_linked(struct io_kiocb *req)
+{
+ struct io_kiocb *nxt = req->link;
+
+ req->link = nxt->link;
+ nxt->link = NULL;
+}
+
static void io_kill_linked_timeout(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -1890,8 +1987,8 @@ static void io_kill_linked_timeout(struct io_kiocb *req)
unsigned long flags;
spin_lock_irqsave(&ctx->completion_lock, flags);
- link = list_first_entry_or_null(&req->link_list, struct io_kiocb,
- link_list);
+ link = req->link;
+
/*
* Can happen if a linked timeout fired and link had been like
* req -> link t-out -> link t-out [-> ...]
@@ -1900,7 +1997,8 @@ static void io_kill_linked_timeout(struct io_kiocb *req)
struct io_timeout_data *io = link->async_data;
int ret;
- list_del_init(&link->link_list);
+ io_remove_next_linked(req);
+ link->timeout.head = NULL;
ret = hrtimer_try_to_cancel(&io->timer);
if (ret != -1) {
io_cqring_fill_event(link, -ECANCELED);
@@ -1917,41 +2015,22 @@ static void io_kill_linked_timeout(struct io_kiocb *req)
}
}
-static struct io_kiocb *io_req_link_next(struct io_kiocb *req)
-{
- struct io_kiocb *nxt;
- /*
- * The list should never be empty when we are called here. But could
- * potentially happen if the chain is messed up, check to be on the
- * safe side.
- */
- if (unlikely(list_empty(&req->link_list)))
- return NULL;
-
- nxt = list_first_entry(&req->link_list, struct io_kiocb, link_list);
- list_del_init(&req->link_list);
- if (!list_empty(&nxt->link_list))
- nxt->flags |= REQ_F_LINK_HEAD;
- return nxt;
-}
-
-/*
- * Called if REQ_F_LINK_HEAD is set, and we fail the head request
- */
static void io_fail_links(struct io_kiocb *req)
{
+ struct io_kiocb *link, *nxt;
struct io_ring_ctx *ctx = req->ctx;
unsigned long flags;
spin_lock_irqsave(&ctx->completion_lock, flags);
- while (!list_empty(&req->link_list)) {
- struct io_kiocb *link = list_first_entry(&req->link_list,
- struct io_kiocb, link_list);
+ link = req->link;
+ req->link = NULL;
- list_del_init(&link->link_list);
- trace_io_uring_fail_link(req, link);
+ while (link) {
+ nxt = link->link;
+ link->link = NULL;
+ trace_io_uring_fail_link(req, link);
io_cqring_fill_event(link, -ECANCELED);
/*
@@ -1963,8 +2042,8 @@ static void io_fail_links(struct io_kiocb *req)
io_put_req_deferred(link, 2);
else
io_double_put_req(link);
+ link = nxt;
}
-
io_commit_cqring(ctx);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
@@ -1973,7 +2052,6 @@ static void io_fail_links(struct io_kiocb *req)
static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
{
- req->flags &= ~REQ_F_LINK_HEAD;
if (req->flags & REQ_F_LINK_TIMEOUT)
io_kill_linked_timeout(req);
@@ -1983,15 +2061,19 @@ static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
* dependencies to the next request. In case of failure, fail the rest
* of the chain.
*/
- if (likely(!(req->flags & REQ_F_FAIL_LINK)))
- return io_req_link_next(req);
+ if (likely(!(req->flags & REQ_F_FAIL_LINK))) {
+ struct io_kiocb *nxt = req->link;
+
+ req->link = NULL;
+ return nxt;
+ }
io_fail_links(req);
return NULL;
}
-static struct io_kiocb *io_req_find_next(struct io_kiocb *req)
+static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
{
- if (likely(!(req->flags & REQ_F_LINK_HEAD)))
+ if (likely(!(req->link) && !(req->flags & REQ_F_LINK_TIMEOUT)))
return NULL;
return __io_req_find_next(req);
}
@@ -2050,7 +2132,8 @@ static void __io_req_task_submit(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- if (!__io_sq_thread_acquire_mm(ctx)) {
+ if (!__io_sq_thread_acquire_mm(ctx) &&
+ !__io_sq_thread_acquire_files(ctx)) {
mutex_lock(&ctx->uring_lock);
__io_queue_sqe(req, NULL);
mutex_unlock(&ctx->uring_lock);
@@ -2086,7 +2169,7 @@ static void io_req_task_queue(struct io_kiocb *req)
}
}
-static void io_queue_next(struct io_kiocb *req)
+static inline void io_queue_next(struct io_kiocb *req)
{
struct io_kiocb *nxt = io_req_find_next(req);
@@ -2143,8 +2226,7 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req)
io_free_req(req);
return;
}
- if (req->flags & REQ_F_LINK_HEAD)
- io_queue_next(req);
+ io_queue_next(req);
if (req->task != rb->task) {
if (rb->task) {
@@ -2246,7 +2328,7 @@ static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
* we wake up the task, and the next invocation will flush the
* entries. We cannot safely do it from here.
*/
- if (noflush && !list_empty(&ctx->cq_overflow_list))
+ if (noflush)
return -1U;
io_cqring_overflow_flush(ctx, false, NULL, NULL);
@@ -2593,7 +2675,7 @@ static bool io_rw_reissue(struct io_kiocb *req, long res)
if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker())
return false;
- ret = io_sq_thread_acquire_mm(req->ctx, req);
+ ret = io_sq_thread_acquire_mm_files(req->ctx, req);
if (io_resubmit_prep(req, ret)) {
refcount_inc(&req->refs);
@@ -2641,7 +2723,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
* find it from a io_iopoll_getevents() thread before the issuer is done
* accessing the kiocb cookie.
*/
-static void io_iopoll_req_issued(struct io_kiocb *req)
+static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -2670,21 +2752,25 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
else
list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
- if ((ctx->flags & IORING_SETUP_SQPOLL) &&
+ /*
+ * If IORING_SETUP_SQPOLL is enabled, sqes are handled either in the sq
+ * thread's task context or in an io worker's task context. If the current
+ * task is the sq thread itself, we don't need to check whether to wake it.
+ */
+ if (in_async && (ctx->flags & IORING_SETUP_SQPOLL) &&
wq_has_sleeper(&ctx->sq_data->wait))
wake_up(&ctx->sq_data->wait);
}
-static void __io_state_file_put(struct io_submit_state *state)
+static inline void __io_state_file_put(struct io_submit_state *state)
{
- if (state->has_refs)
- fput_many(state->file, state->has_refs);
- state->file = NULL;
+ fput_many(state->file, state->file_refs);
+ state->file_refs = 0;
}
static inline void io_state_file_put(struct io_submit_state *state)
{
- if (state->file)
+ if (state->file_refs)
__io_state_file_put(state);
}
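The wake_up() on sq_data->wait above is the kernel half of the IORING_SQ_NEED_WAKEUP protocol; the userspace half must kick an idled sq thread through io_uring_enter(2). A hedged raw-syscall sketch (the enter() wrapper is illustrative):

    #include <linux/io_uring.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static int enter(int ring_fd, unsigned to_submit, unsigned flags)
    {
            return (int) syscall(__NR_io_uring_enter, ring_fd, to_submit,
                                 0, flags, NULL, 0);
    }

    /* sq_flags points into the SQ ring mmap, at params.sq_off.flags */
    static void kick_sq_thread(int ring_fd, const unsigned *sq_flags)
    {
            if (__atomic_load_n(sq_flags, __ATOMIC_ACQUIRE) & IORING_SQ_NEED_WAKEUP)
                    enter(ring_fd, 0, IORING_ENTER_SQ_WAKEUP);
    }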
@@ -2698,19 +2784,19 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd)
if (!state)
return fget(fd);
- if (state->file) {
+ if (state->file_refs) {
if (state->fd == fd) {
- state->has_refs--;
+ state->file_refs--;
return state->file;
}
__io_state_file_put(state);
}
state->file = fget_many(fd, state->ios_left);
- if (!state->file)
+ if (unlikely(!state->file))
return NULL;
state->fd = fd;
- state->has_refs = state->ios_left - 1;
+ state->file_refs = state->ios_left - 1;
return state->file;
}
@@ -3065,7 +3151,7 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
return __io_iov_buffer_select(req, iov, needs_lock);
}
-static ssize_t __io_import_iovec(int rw, struct io_kiocb *req,
+static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
struct iovec **iovec, struct iov_iter *iter,
bool needs_lock)
{
@@ -3094,7 +3180,7 @@ static ssize_t __io_import_iovec(int rw, struct io_kiocb *req,
ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
*iovec = NULL;
- return ret < 0 ? ret : sqe_len;
+ return ret;
}
if (req->flags & REQ_F_BUFFER_SELECT) {
@@ -3111,18 +3197,6 @@ static ssize_t __io_import_iovec(int rw, struct io_kiocb *req,
req->ctx->compat);
}
-static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
- struct iovec **iovec, struct iov_iter *iter,
- bool needs_lock)
-{
- struct io_async_rw *iorw = req->async_data;
-
- if (!iorw)
- return __io_import_iovec(rw, req, iovec, iter, needs_lock);
- *iovec = NULL;
- return iov_iter_count(&iorw->iter);
-}
-
static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
{
return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
@@ -3246,7 +3320,7 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
struct iovec *iov = iorw->fast_iov;
ssize_t ret;
- ret = __io_import_iovec(rw, req, &iov, &iorw->iter, false);
+ ret = io_import_iovec(rw, req, &iov, &iorw->iter, false);
if (unlikely(ret < 0))
return ret;
@@ -3379,17 +3453,17 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
struct iov_iter __iter, *iter = &__iter;
struct io_async_rw *rw = req->async_data;
ssize_t io_size, ret, ret2;
- size_t iov_count;
bool no_async;
- if (rw)
+ if (rw) {
iter = &rw->iter;
-
- ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
- if (ret < 0)
- return ret;
- iov_count = iov_iter_count(iter);
- io_size = ret;
+ iovec = NULL;
+ } else {
+ ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
+ if (ret < 0)
+ return ret;
+ }
+ io_size = iov_iter_count(iter);
req->result = io_size;
ret = 0;
@@ -3405,7 +3479,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
if (no_async)
goto copy_iov;
- ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), iov_count);
+ ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size);
if (unlikely(ret))
goto out_free;
@@ -3424,7 +3498,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
if (req->file->f_flags & O_NONBLOCK)
goto done;
/* some cases will consume bytes even on error returns */
- iov_iter_revert(iter, iov_count - iov_iter_count(iter));
+ iov_iter_revert(iter, io_size - iov_iter_count(iter));
ret = 0;
goto copy_iov;
} else if (ret < 0) {
@@ -3507,17 +3581,17 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
struct kiocb *kiocb = &req->rw.kiocb;
struct iov_iter __iter, *iter = &__iter;
struct io_async_rw *rw = req->async_data;
- size_t iov_count;
ssize_t ret, ret2, io_size;
- if (rw)
+ if (rw) {
iter = &rw->iter;
-
- ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
- if (ret < 0)
- return ret;
- iov_count = iov_iter_count(iter);
- io_size = ret;
+ iovec = NULL;
+ } else {
+ ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
+ if (ret < 0)
+ return ret;
+ }
+ io_size = iov_iter_count(iter);
req->result = io_size;
/* Ensure we clear previously set non-block flag */
@@ -3535,7 +3609,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
(req->flags & REQ_F_ISREG))
goto copy_iov;
- ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), iov_count);
+ ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), io_size);
if (unlikely(ret))
goto out_free;
@@ -3578,7 +3652,7 @@ done:
} else {
copy_iov:
/* some cases will consume bytes even on error returns */
- iov_iter_revert(iter, iov_count - iov_iter_count(iter));
+ iov_iter_revert(iter, io_size - iov_iter_count(iter));
ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
if (!ret)
return -EAGAIN;
@@ -3590,6 +3664,135 @@ out_free:
return ret;
}
+static int io_renameat_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ struct io_rename *ren = &req->rename;
+ const char __user *oldf, *newf;
+
+ if (unlikely(req->flags & REQ_F_FIXED_FILE))
+ return -EBADF;
+
+ ren->old_dfd = READ_ONCE(sqe->fd);
+ oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+ ren->new_dfd = READ_ONCE(sqe->len);
+ ren->flags = READ_ONCE(sqe->rename_flags);
+
+ ren->oldpath = getname(oldf);
+ if (IS_ERR(ren->oldpath))
+ return PTR_ERR(ren->oldpath);
+
+ ren->newpath = getname(newf);
+ if (IS_ERR(ren->newpath)) {
+ putname(ren->oldpath);
+ return PTR_ERR(ren->newpath);
+ }
+
+ req->flags |= REQ_F_NEED_CLEANUP;
+ return 0;
+}
+
+static int io_renameat(struct io_kiocb *req, bool force_nonblock)
+{
+ struct io_rename *ren = &req->rename;
+ int ret;
+
+ if (force_nonblock)
+ return -EAGAIN;
+
+ ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
+ ren->newpath, ren->flags);
+
+ req->flags &= ~REQ_F_NEED_CLEANUP;
+ if (ret < 0)
+ req_set_fail_links(req);
+ io_req_complete(req, ret);
+ return 0;
+}
+
+static int io_unlinkat_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ struct io_unlink *un = &req->unlink;
+ const char __user *fname;
+
+ if (unlikely(req->flags & REQ_F_FIXED_FILE))
+ return -EBADF;
+
+ un->dfd = READ_ONCE(sqe->fd);
+
+ un->flags = READ_ONCE(sqe->unlink_flags);
+ if (un->flags & ~AT_REMOVEDIR)
+ return -EINVAL;
+
+ fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ un->filename = getname(fname);
+ if (IS_ERR(un->filename))
+ return PTR_ERR(un->filename);
+
+ req->flags |= REQ_F_NEED_CLEANUP;
+ return 0;
+}
+
+static int io_unlinkat(struct io_kiocb *req, bool force_nonblock)
+{
+ struct io_unlink *un = &req->unlink;
+ int ret;
+
+ if (force_nonblock)
+ return -EAGAIN;
+
+ if (un->flags & AT_REMOVEDIR)
+ ret = do_rmdir(un->dfd, un->filename);
+ else
+ ret = do_unlinkat(un->dfd, un->filename);
+
+ req->flags &= ~REQ_F_NEED_CLEANUP;
+ if (ret < 0)
+ req_set_fail_links(req);
+ io_req_complete(req, ret);
+ return 0;
+}
+
+static int io_shutdown_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+#if defined(CONFIG_NET)
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+ if (sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
+ sqe->buf_index)
+ return -EINVAL;
+
+ req->shutdown.how = READ_ONCE(sqe->len);
+ return 0;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
+static int io_shutdown(struct io_kiocb *req, bool force_nonblock)
+{
+#if defined(CONFIG_NET)
+ struct socket *sock;
+ int ret;
+
+ if (force_nonblock)
+ return -EAGAIN;
+
+ sock = sock_from_file(req->file);
+ if (unlikely(!sock))
+ return -ENOTSOCK;
+
+ ret = __sys_shutdown_sock(sock, req->shutdown.how);
+ io_req_complete(req, ret);
+ return 0;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
static int __io_splice_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
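Together the handlers above expose shutdown(2), renameat2(2) and unlinkat(2)/rmdir through the ring; rename and unlink always punt to the async worker (-EAGAIN under force_nonblock). A hedged liburing sketch, assuming the prep helpers that track this kernel change and an SQ with room for three entries:

    #include <fcntl.h>
    #include <liburing.h>
    #include <sys/socket.h>

    static int queue_fs_ops(struct io_uring *ring, int sockfd)
    {
            struct io_uring_sqe *sqe;

            sqe = io_uring_get_sqe(ring);
            io_uring_prep_renameat(sqe, AT_FDCWD, "old.txt",
                                   AT_FDCWD, "new.txt", 0);

            sqe = io_uring_get_sqe(ring);
            io_uring_prep_unlinkat(sqe, AT_FDCWD, "scratch.txt", 0);

            sqe = io_uring_get_sqe(ring);
            io_uring_prep_shutdown(sqe, sockfd, SHUT_WR);

            return io_uring_submit(ring);   /* 3 on success */
    }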
@@ -3804,7 +4007,7 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
u64 flags, mode;
- if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
mode = READ_ONCE(sqe->len);
flags = READ_ONCE(sqe->open_flags);
@@ -3818,7 +4021,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
size_t len;
int ret;
- if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
len = READ_ONCE(sqe->len);
@@ -3948,11 +4151,17 @@ static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock,
head = idr_find(&ctx->io_buffer_idr, p->bgid);
if (head)
ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
-
- io_ring_submit_lock(ctx, !force_nonblock);
if (ret < 0)
req_set_fail_links(req);
- __io_req_complete(req, ret, 0, cs);
+
+ /* need to hold the lock to complete IOPOLL requests */
+ if (ctx->flags & IORING_SE