-rw-r--r-- | fs/exec.c                     |    6 |
-rw-r--r-- | fs/file.c                     |    2 |
-rw-r--r-- | fs/io-wq.c                    |  200 |
-rw-r--r-- | fs/io-wq.h                    |    4 |
-rw-r--r-- | fs/io_uring.c                 | 2181 |
-rw-r--r-- | include/linux/fs.h            |   46 |
-rw-r--r-- | include/linux/io_uring.h      |   58 |
-rw-r--r-- | include/linux/sched.h         |    5 |
-rw-r--r-- | include/uapi/linux/io_uring.h |   61 |
-rw-r--r-- | init/init_task.c              |    3 |
-rw-r--r-- | kernel/fork.c                 |    6 |
-rw-r--r-- | net/unix/scm.c                |    1 |
12 files changed, 1662 insertions, 911 deletions
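
The first hunks below hook io_uring cancellation into execve (fs/exec.c) and exit_files (fs/file.c) via io_uring_task_cancel() and io_uring_files_cancel(). The include/linux/io_uring.h changes that define those helpers appear in the diffstat but fall outside this excerpt; what follows is only a rough sketch of the usual wrapper pattern — static inline stubs when CONFIG_IO_URING is disabled, a per-task io_uring pointer check otherwise. The __io_uring_*() names and the current->io_uring test are assumptions here, not the patch's exact code.

/* Illustrative sketch only -- not copied from the patch. */
#if defined(CONFIG_IO_URING)
void __io_uring_task_cancel(void);
void __io_uring_files_cancel(struct files_struct *files);

static inline void io_uring_task_cancel(void)
{
	/* Assumed check: only cancel if this task ever used io_uring. */
	if (current->io_uring)
		__io_uring_task_cancel();
}

static inline void io_uring_files_cancel(struct files_struct *files)
{
	if (current->io_uring)
		__io_uring_files_cancel(files);
}
#else
static inline void io_uring_task_cancel(void)
{
}
static inline void io_uring_files_cancel(struct files_struct *files)
{
}
#endif

With stubs along these lines, the call sites added in bprm_execve() and exit_files() need no #ifdef CONFIG_IO_URING guards and cost nothing for tasks that never used io_uring.
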
diff --git a/fs/exec.c b/fs/exec.c index a91003e28eaa..07910f5032e7 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -62,6 +62,7 @@ #include <linux/oom.h> #include <linux/compat.h> #include <linux/vmalloc.h> +#include <linux/io_uring.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> @@ -1895,6 +1896,11 @@ static int bprm_execve(struct linux_binprm *bprm, struct files_struct *displaced; int retval; + /* + * Cancel any io_uring activity across execve + */ + io_uring_task_cancel(); + retval = unshare_files(&displaced); if (retval) return retval; diff --git a/fs/file.c b/fs/file.c index 21c0893f2f1d..4559b5fec3bd 100644 --- a/fs/file.c +++ b/fs/file.c @@ -21,6 +21,7 @@ #include <linux/rcupdate.h> #include <linux/close_range.h> #include <net/sock.h> +#include <linux/io_uring.h> unsigned int sysctl_nr_open __read_mostly = 1024*1024; unsigned int sysctl_nr_open_min = BITS_PER_LONG; @@ -452,6 +453,7 @@ void exit_files(struct task_struct *tsk) struct files_struct * files = tsk->files; if (files) { + io_uring_files_cancel(files); task_lock(tsk); tsk->files = NULL; task_unlock(tsk); diff --git a/fs/io-wq.c b/fs/io-wq.c index 414beb543883..0a182f1333e8 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -17,6 +17,7 @@ #include <linux/rculist_nulls.h> #include <linux/fs_struct.h> #include <linux/task_work.h> +#include <linux/blk-cgroup.h> #include "io-wq.h" @@ -26,9 +27,8 @@ enum { IO_WORKER_F_UP = 1, /* up and active */ IO_WORKER_F_RUNNING = 2, /* account as running */ IO_WORKER_F_FREE = 4, /* worker on free list */ - IO_WORKER_F_EXITING = 8, /* worker exiting */ - IO_WORKER_F_FIXED = 16, /* static idle worker */ - IO_WORKER_F_BOUND = 32, /* is doing bounded work */ + IO_WORKER_F_FIXED = 8, /* static idle worker */ + IO_WORKER_F_BOUND = 16, /* is doing bounded work */ }; enum { @@ -57,9 +57,13 @@ struct io_worker { struct rcu_head rcu; struct mm_struct *mm; +#ifdef CONFIG_BLK_CGROUP + struct cgroup_subsys_state *blkcg_css; +#endif const struct cred *cur_creds; const struct cred *saved_creds; struct files_struct *restore_files; + struct nsproxy *restore_nsproxy; struct fs_struct *restore_fs; }; @@ -87,7 +91,7 @@ enum { */ struct io_wqe { struct { - spinlock_t lock; + raw_spinlock_t lock; struct io_wq_work_list work_list; unsigned long hash_map; unsigned flags; @@ -148,11 +152,12 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) if (current->files != worker->restore_files) { __acquire(&wqe->lock); - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); dropped_lock = true; task_lock(current); current->files = worker->restore_files; + current->nsproxy = worker->restore_nsproxy; task_unlock(current); } @@ -166,7 +171,7 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) if (worker->mm) { if (!dropped_lock) { __acquire(&wqe->lock); - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); dropped_lock = true; } __set_current_state(TASK_RUNNING); @@ -175,6 +180,13 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) worker->mm = NULL; } +#ifdef CONFIG_BLK_CGROUP + if (worker->blkcg_css) { + kthread_associate_blkcg(NULL); + worker->blkcg_css = NULL; + } +#endif + return dropped_lock; } @@ -200,7 +212,6 @@ static void io_worker_exit(struct io_worker *worker) { struct io_wqe *wqe = worker->wqe; struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker); - unsigned nr_workers; /* * If we're not at zero, someone else is holding a brief reference @@ -220,23 +231,19 @@ static void io_worker_exit(struct io_worker *worker) 
worker->flags = 0; preempt_enable(); - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); hlist_nulls_del_rcu(&worker->nulls_node); list_del_rcu(&worker->all_list); if (__io_worker_unuse(wqe, worker)) { __release(&wqe->lock); - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); } acct->nr_workers--; - nr_workers = wqe->acct[IO_WQ_ACCT_BOUND].nr_workers + - wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers; - spin_unlock_irq(&wqe->lock); - - /* all workers gone, wq exit can proceed */ - if (!nr_workers && refcount_dec_and_test(&wqe->wq->refs)) - complete(&wqe->wq->done); + raw_spin_unlock_irq(&wqe->lock); kfree_rcu(worker, rcu); + if (refcount_dec_and_test(&wqe->wq->refs)) + complete(&wqe->wq->done); } static inline bool io_wqe_run_queue(struct io_wqe *wqe) @@ -318,6 +325,7 @@ static void io_worker_start(struct io_wqe *wqe, struct io_worker *worker) worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING); worker->restore_files = current->files; + worker->restore_nsproxy = current->nsproxy; worker->restore_fs = current->fs; io_wqe_inc_running(wqe, worker); } @@ -436,6 +444,17 @@ static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work) work->flags |= IO_WQ_WORK_CANCEL; } +static inline void io_wq_switch_blkcg(struct io_worker *worker, + struct io_wq_work *work) +{ +#ifdef CONFIG_BLK_CGROUP + if (work->blkcg_css != worker->blkcg_css) { + kthread_associate_blkcg(work->blkcg_css); + worker->blkcg_css = work->blkcg_css; + } +#endif +} + static void io_wq_switch_creds(struct io_worker *worker, struct io_wq_work *work) { @@ -454,6 +473,7 @@ static void io_impersonate_work(struct io_worker *worker, if (work->files && current->files != work->files) { task_lock(current); current->files = work->files; + current->nsproxy = work->nsproxy; task_unlock(current); } if (work->fs && current->fs != work->fs) @@ -463,6 +483,7 @@ static void io_impersonate_work(struct io_worker *worker, if (worker->cur_creds != work->creds) io_wq_switch_creds(worker, work); current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->fsize; + io_wq_switch_blkcg(worker, work); } static void io_assign_current_work(struct io_worker *worker, @@ -504,7 +525,7 @@ get_next: else if (!wq_list_empty(&wqe->work_list)) wqe->flags |= IO_WQE_FLAG_STALLED; - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); if (!work) break; io_assign_current_work(worker, work); @@ -538,17 +559,17 @@ get_next: io_wqe_enqueue(wqe, linked); if (hash != -1U && !next_hashed) { - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); wqe->hash_map &= ~BIT_ULL(hash); wqe->flags &= ~IO_WQE_FLAG_STALLED; /* skip unnecessary unlock-lock wqe->lock */ if (!work) goto get_next; - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); } } while (work); - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); } while (1); } @@ -563,7 +584,7 @@ static int io_wqe_worker(void *data) while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) { set_current_state(TASK_INTERRUPTIBLE); loop: - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); if (io_wqe_run_queue(wqe)) { __set_current_state(TASK_RUNNING); io_worker_handle_work(worker); @@ -574,7 +595,7 @@ loop: __release(&wqe->lock); goto loop; } - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); if (signal_pending(current)) flush_signals(current); if (schedule_timeout(WORKER_IDLE_TIMEOUT)) @@ -586,11 +607,11 @@ loop: } if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) { - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); if (!wq_list_empty(&wqe->work_list)) 
io_worker_handle_work(worker); else - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); } io_worker_exit(worker); @@ -630,14 +651,14 @@ void io_wq_worker_sleeping(struct task_struct *tsk) worker->flags &= ~IO_WORKER_F_RUNNING; - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); io_wqe_dec_running(wqe, worker); - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); } static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) { - struct io_wqe_acct *acct =&wqe->acct[index]; + struct io_wqe_acct *acct = &wqe->acct[index]; struct io_worker *worker; worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node); @@ -656,7 +677,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) return false; } - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); list_add_tail_rcu(&worker->all_list, &wqe->all_list); worker->flags |= IO_WORKER_F_FREE; @@ -665,11 +686,12 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) if (!acct->nr_workers && (worker->flags & IO_WORKER_F_BOUND)) worker->flags |= IO_WORKER_F_FIXED; acct->nr_workers++; - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); if (index == IO_WQ_ACCT_UNBOUND) atomic_inc(&wq->user->processes); + refcount_inc(&wq->refs); wake_up_process(worker->task); return true; } @@ -685,28 +707,63 @@ static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index) return acct->nr_workers < acct->max_workers; } +static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data) +{ + send_sig(SIGINT, worker->task, 1); + return false; +} + +/* + * Iterate the passed in list and call the specific function for each + * worker that isn't exiting + */ +static bool io_wq_for_each_worker(struct io_wqe *wqe, + bool (*func)(struct io_worker *, void *), + void *data) +{ + struct io_worker *worker; + bool ret = false; + + list_for_each_entry_rcu(worker, &wqe->all_list, all_list) { + if (io_worker_get(worker)) { + /* no task if node is/was offline */ + if (worker->task) + ret = func(worker, data); + io_worker_release(worker); + if (ret) + break; + } + } + + return ret; +} + +static bool io_wq_worker_wake(struct io_worker *worker, void *data) +{ + wake_up_process(worker->task); + return false; +} + /* * Manager thread. Tasked with creating new workers, if we need them. 
*/ static int io_wq_manager(void *data) { struct io_wq *wq = data; - int workers_to_create = num_possible_nodes(); int node; /* create fixed workers */ - refcount_set(&wq->refs, workers_to_create); + refcount_set(&wq->refs, 1); for_each_node(node) { if (!node_online(node)) continue; - if (!create_io_worker(wq, wq->wqes[node], IO_WQ_ACCT_BOUND)) - goto err; - workers_to_create--; + if (create_io_worker(wq, wq->wqes[node], IO_WQ_ACCT_BOUND)) + continue; + set_bit(IO_WQ_BIT_ERROR, &wq->state); + set_bit(IO_WQ_BIT_EXIT, &wq->state); + goto out; } - while (workers_to_create--) - refcount_dec(&wq->refs); - complete(&wq->done); while (!kthread_should_stop()) { @@ -720,12 +777,12 @@ static int io_wq_manager(void *data) if (!node_online(node)) continue; - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND)) fork_worker[IO_WQ_ACCT_BOUND] = true; if (io_wqe_need_worker(wqe, IO_WQ_ACCT_UNBOUND)) fork_worker[IO_WQ_ACCT_UNBOUND] = true; - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); if (fork_worker[IO_WQ_ACCT_BOUND]) create_io_worker(wq, wqe, IO_WQ_ACCT_BOUND); if (fork_worker[IO_WQ_ACCT_UNBOUND]) @@ -738,12 +795,18 @@ static int io_wq_manager(void *data) if (current->task_works) task_work_run(); - return 0; -err: - set_bit(IO_WQ_BIT_ERROR, &wq->state); - set_bit(IO_WQ_BIT_EXIT, &wq->state); - if (refcount_sub_and_test(workers_to_create, &wq->refs)) +out: + if (refcount_dec_and_test(&wq->refs)) { complete(&wq->done); + return 0; + } + /* if ERROR is set and we get here, we have workers to wake */ + if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) { + rcu_read_lock(); + for_each_node(node) + io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL); + rcu_read_unlock(); + } return 0; } @@ -821,10 +884,10 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) } work_flags = work->flags; - spin_lock_irqsave(&wqe->lock, flags); + raw_spin_lock_irqsave(&wqe->lock, flags); io_wqe_insert_work(wqe, work); wqe->flags &= ~IO_WQE_FLAG_STALLED; - spin_unlock_irqrestore(&wqe->lock, flags); + raw_spin_unlock_irqrestore(&wqe->lock, flags); if ((work_flags & IO_WQ_WORK_CONCURRENT) || !atomic_read(&acct->nr_running)) @@ -850,37 +913,6 @@ void io_wq_hash_work(struct io_wq_work *work, void *val) work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT)); } -static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data) -{ - send_sig(SIGINT, worker->task, 1); - return false; -} - -/* - * Iterate the passed in list and call the specific function for each - * worker that isn't exiting - */ -static bool io_wq_for_each_worker(struct io_wqe *wqe, - bool (*func)(struct io_worker *, void *), - void *data) -{ - struct io_worker *worker; - bool ret = false; - - list_for_each_entry_rcu(worker, &wqe->all_list, all_list) { - if (io_worker_get(worker)) { - /* no task if node is/was offline */ - if (worker->task) - ret = func(worker, data); - io_worker_release(worker); - if (ret) - break; - } - } - - return ret; -} - void io_wq_cancel_all(struct io_wq *wq) { int node; @@ -951,13 +983,13 @@ static void io_wqe_cancel_pending_work(struct io_wqe *wqe, unsigned long flags; retry: - spin_lock_irqsave(&wqe->lock, flags); + raw_spin_lock_irqsave(&wqe->lock, flags); wq_list_for_each(node, prev, &wqe->work_list) { work = container_of(node, struct io_wq_work, list); if (!match->fn(work, match->data)) continue; io_wqe_remove_pending(wqe, work, prev); - spin_unlock_irqrestore(&wqe->lock, flags); + raw_spin_unlock_irqrestore(&wqe->lock, flags); 
io_run_cancel(work, wqe); match->nr_pending++; if (!match->cancel_all) @@ -966,7 +998,7 @@ retry: /* not safe to continue after unlock */ goto retry; } - spin_unlock_irqrestore(&wqe->lock, flags); + raw_spin_unlock_irqrestore(&wqe->lock, flags); } static void io_wqe_cancel_running_work(struct io_wqe *wqe, @@ -1074,7 +1106,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) } atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0); wqe->wq = wq; - spin_lock_init(&wqe->lock); + raw_spin_lock_init(&wqe->lock); INIT_WQ_LIST(&wqe->work_list); INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0); INIT_LIST_HEAD(&wqe->all_list); @@ -1113,12 +1145,6 @@ bool io_wq_get(struct io_wq *wq, struct io_wq_data *data) return refcount_inc_not_zero(&wq->use_refs); } -static bool io_wq_worker_wake(struct io_worker *worker, void *data) -{ - wake_up_process(worker->task); - return false; -} - static void __io_wq_destroy(struct io_wq *wq) { int node; diff --git a/fs/io-wq.h b/fs/io-wq.h index ddaf9614cf9b..84bcf6a85523 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -87,7 +87,11 @@ struct io_wq_work { struct io_wq_work_node list; struct files_struct *files; struct mm_struct *mm; +#ifdef CONFIG_BLK_CGROUP + struct cgroup_subsys_state *blkcg_css; +#endif const struct cred *creds; + struct nsproxy *nsproxy; struct fs_struct *fs; unsigned long fsize; unsigned flags; diff --git a/fs/io_uring.c b/fs/io_uring.c index f58b3d6bba8a..fc6de6b4784e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -79,6 +79,8 @@ #include <linux/splice.h> #include <linux/task_work.h> #include <linux/pagemap.h> +#include <linux/io_uring.h> +#include <linux/blk-cgroup.h> #define CREATE_TRACE_POINTS #include <trace/events/io_uring.h> @@ -98,6 +100,8 @@ #define IORING_MAX_FILES_TABLE (1U << IORING_FILE_TABLE_SHIFT) #define IORING_FILE_TABLE_MASK (IORING_MAX_FILES_TABLE - 1) #define IORING_MAX_FIXED_FILES (64 * IORING_MAX_FILES_TABLE) +#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ + IORING_REGISTER_LAST + IORING_OP_LAST) struct io_uring { u32 head ____cacheline_aligned_in_smp; @@ -187,6 +191,7 @@ struct io_mapped_ubuf { size_t len; struct bio_vec *bvec; unsigned int nr_bvecs; + unsigned long acct_pages; }; struct fixed_file_table { @@ -205,7 +210,7 @@ struct fixed_file_data { struct fixed_file_table *table; struct io_ring_ctx *ctx; - struct percpu_ref *cur_refs; + struct fixed_file_ref_node *node; struct percpu_ref refs; struct completion done; struct list_head ref_list; @@ -219,6 +224,27 @@ struct io_buffer { __u16 bid; }; +struct io_restriction { + DECLARE_BITMAP(register_op, IORING_REGISTER_LAST); + DECLARE_BITMAP(sqe_op, IORING_OP_LAST); + u8 sqe_flags_allowed; + u8 sqe_flags_required; + bool registered; +}; + +struct io_sq_data { + refcount_t refs; + struct mutex lock; + + /* ctx's that are using this sqd */ + struct list_head ctx_list; + struct list_head ctx_new_list; + struct mutex ctx_lock; + + struct task_struct *thread; + struct wait_queue_head wait; +}; + struct io_ring_ctx { struct { struct percpu_ref refs; @@ -231,6 +257,7 @@ struct io_ring_ctx { unsigned int cq_overflow_flushed: 1; unsigned int drain_next: 1; unsigned int eventfd_async: 1; + unsigned int restricted: 1; /* * Ring buffer of indices into array of io_uring_sqe, which is @@ -264,9 +291,25 @@ struct io_ring_ctx { /* IO offload */ struct io_wq *io_wq; - struct task_struct *sqo_thread; /* if using sq thread polling */ - struct mm_struct *sqo_mm; - wait_queue_head_t sqo_wait; + + /* + * For SQPOLL usage - we hold a reference to the parent task, so we 
+ * have access to the ->files + */ + struct task_struct *sqo_task; + + /* Only used for accounting purposes */ + struct mm_struct *mm_account; + +#ifdef CONFIG_BLK_CGROUP + struct cgroup_subsys_state *sqo_blkcg_css; +#endif + + struct io_sq_data *sq_data; /* if using sq thread polling */ + + struct wait_queue_head sqo_sq_wait; + struct wait_queue_entry sqo_wait_entry; + struct list_head sqd_list; /* * If used, fixed file set. Writers must ensure that ->refs is dead, @@ -275,8 +318,6 @@ struct io_ring_ctx { */ struct fixed_file_data *file_data; unsigned nr_user_files; - int ring_fd; - struct file *ring_file; /* if used, fixed mapped user buffers */ unsigned nr_user_bufs; @@ -338,6 +379,7 @@ struct io_ring_ctx { struct llist_head file_put_llist; struct work_struct exit_work; + struct io_restriction restrictions; }; /* @@ -392,13 +434,16 @@ struct io_cancel { struct io_timeout { struct file *file; - u64 addr; - int flags; u32 off; u32 target_seq; struct list_head list; }; +struct io_timeout_rem { + struct file *file; + u64 addr; +}; + struct io_rw { /* NOTE: kiocb has the file as the first member, so don't do it here */ struct kiocb kiocb; @@ -514,15 +559,6 @@ struct io_async_rw { struct wait_page_queue wpq; }; -struct io_async_ctx { - union { - struct io_async_rw rw; - struct io_async_msghdr msg; - struct io_async_connect connect; - struct io_timeout_data timeout; - }; -}; - enum { REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT, @@ -544,7 +580,6 @@ enum { REQ_F_BUFFER_SELECTED_BIT, REQ_F_NO_FILE_TABLE_BIT, REQ_F_WORK_INITIALIZED_BIT, - REQ_F_TASK_PINNED_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -590,8 +625,6 @@ enum { REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT), /* io_wq_work is initialized */ REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT), - /* req->task is refcounted */ - REQ_F_TASK_PINNED = BIT(REQ_F_TASK_PINNED_BIT), }; struct async_poll { @@ -614,6 +647,7 @@ struct io_kiocb { struct io_sync sync; struct io_cancel cancel; struct io_timeout timeout; + struct io_timeout_rem timeout_rem; struct io_connect connect; struct io_sr_msg sr_msg; struct io_open open; @@ -629,7 +663,8 @@ struct io_kiocb { struct io_completion compl; }; - struct io_async_ctx *io; + /* opcode allocated if it needs to store data for async defer */ + void *async_data; u8 opcode; /* polled IO has completed */ u8 iopoll_completed; @@ -697,8 +732,6 @@ struct io_submit_state { }; struct io_op_def { - /* needs req->io allocated for deferral/async */ - unsigned async_ctx : 1; /* needs current->mm setup, does mm access */ unsigned needs_mm : 1; /* needs req->file assigned */ @@ -720,35 +753,49 @@ struct io_op_def { unsigned pollout : 1; /* op supports buffer selection */ unsigned buffer_select : 1; + /* needs rlimit(RLIMIT_FSIZE) assigned */ unsigned needs_fsize : 1; + /* must always have async data allocated */ + unsigned needs_async_data : 1; + /* needs blkcg context, issues async io potentially */ + unsigned needs_blkcg : 1; + /* size of async data needed, if any */ + unsigned short async_size; }; -static const struct io_op_def io_op_defs[] = { +static const struct io_op_def io_op_defs[] __read_mostly = { [IORING_OP_NOP] = {}, [IORING_OP_READV] = { - .async_ctx = 1, .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, + .needs_async_data = 1, + .needs_blkcg = 1, + .async_size = sizeof(struct io_async_rw), }, [IORING_OP_WRITEV] = { - .async_ctx = 1, .needs_mm = 1, .needs_file = 1, 
.hash_reg_file = 1, .unbound_nonreg_file = 1, .pollout = 1, .needs_fsize = 1, + .needs_async_data = 1, + .needs_blkcg = 1, + .async_size = sizeof(struct io_async_rw), }, [IORING_OP_FSYNC] = { .needs_file = 1, + .needs_blkcg = 1, }, [IORING_OP_READ_FIXED] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, + .needs_blkcg = 1, + .async_size = sizeof(struct io_async_rw), }, [IORING_OP_WRITE_FIXED] = { .needs_file = 1, @@ -756,6 +803,8 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollout = 1, .needs_fsize = 1, + .needs_blkcg = 1, + .async_size = sizeof(struct io_async_rw), }, [IORING_OP_POLL_ADD] = { .needs_file = 1, @@ -764,27 +813,33 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_POLL_REMOVE] = {}, [IORING_OP_SYNC_FILE_RANGE] = { .needs_file = 1, + .needs_blkcg = 1, }, [IORING_OP_SENDMSG] = { - .async_ctx = 1, .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, .needs_fs = 1, .pollout = 1, + .needs_async_data = 1, + .needs_blkcg = 1, + .async_size = sizeof(struct io_async_msghdr), }, [IORING_OP_RECVMSG] = { - .async_ctx = 1, .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, .needs_fs = 1, .pollin = 1, .buffer_select = 1, + .needs_async_data = 1, + .needs_blkcg = 1, + .async_size = sizeof(struct io_async_msghdr), }, [IORING_OP_TIMEOUT] = { - .async_ctx = 1, .needs_mm = 1, + .needs_async_data = 1, + .async_size = sizeof(struct io_timeout_data), }, [IORING_OP_TIMEOUT_REMOVE] = {}, [IORING_OP_ACCEPT] = { @@ -796,28 +851,33 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_ASYNC_CANCEL] = {}, [IORING_OP_LINK_TIMEOUT] = { - .async_ctx = 1, .needs_mm = 1, + .needs_async_data = 1, + .async_size = sizeof(struct io_timeout_data), }, [IORING_OP_CONNECT] = { - .async_ctx = 1, .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, + .needs_async_data = 1, + .async_size = sizeof(struct io_async_connect), }, [IORING_OP_FALLOCATE] = { .needs_file = 1, .needs_fsize = 1, + .needs_blkcg = 1, }, [IORING_OP_OPENAT] = { .file_table = 1, .needs_fs = 1, + .needs_blkcg = 1, }, [IORING_OP_CLOSE] = { .needs_file = 1, .needs_file_no_error = 1, .file_table = 1, + .needs_blkcg = 1, }, [IORING_OP_FILES_UPDATE] = { .needs_mm = 1, @@ -827,6 +887,7 @@ static const struct io_op_def io_op_defs[] = { .needs_mm = 1, .needs_fs = 1, .file_table = 1, + .needs_blkcg = 1, }, [IORING_OP_READ] = { .needs_mm = 1, @@ -834,6 +895,8 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, + .needs_blkcg = 1, + .async_size = sizeof(struct io_async_rw), }, [IORING_OP_WRITE] = { .needs_mm = 1, @@ -841,18 +904,23 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollout = 1, .needs_fsize = 1, + .needs_blkcg = 1, + .async_size = sizeof(struct io_async_rw), }, [IORING_OP_FADVISE] = { .needs_file = 1, + .needs_blkcg = 1, }, [IORING_OP_MADVISE] = { .needs_mm = 1, + .needs_blkcg = 1, }, [IORING_OP_SEND] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, + .needs_blkcg = 1, }, [IORING_OP_RECV] = { .needs_mm = 1, @@ -860,10 +928,12 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, + .needs_blkcg = 1, }, [IORING_OP_OPENAT2] = { .file_table = 1, .needs_fs = 1, + .needs_blkcg = 1, }, [IORING_OP_EPOLL_CTL] = { .unbound_nonreg_file = 1, @@ -873,6 +943,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, + .needs_blkcg = 1, }, 
[IORING_OP_PROVIDE_BUFFERS] = {}, [IORING_OP_REMOVE_BUFFERS] = {}, @@ -900,13 +971,10 @@ static void io_queue_linked_timeout(struct io_kiocb *req); static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct io_uring_files_update *ip, unsigned nr_args); -static int io_prep_work_files(struct io_kiocb *req); static void __io_clean_op(struct io_kiocb *req); -static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, - int fd, struct file **out_file, bool fixed); -static void __io_queue_sqe(struct io_kiocb *req, - const struct io_uring_sqe *sqe, - struct io_comp_state *cs); +static struct file *io_file_get(struct io_submit_state *state, + struct io_kiocb *req, int fd, bool fixed); +static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs); static void io_file_put_work(struct work_struct *work); static ssize_t io_import_iovec(int rw, struct io_kiocb *req, @@ -918,7 +986,7 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, static struct kmem_cache *req_cachep; -static const struct file_operations io_uring_fops; +static const struct file_operations io_uring_fops __read_mostly; struct sock *io_uring_get_socket(struct file *file) { @@ -933,14 +1001,6 @@ struct sock *io_uring_get_socket(struct file *file) } EXPORT_SYMBOL(io_uring_get_socket); -static void io_get_req_task(struct io_kiocb *req) -{ - if (req->flags & REQ_F_TASK_PINNED) - return; - get_task_struct(req->task); - req->flags |= REQ_F_TASK_PINNED; -} - static inline void io_clean_op(struct io_kiocb *req) { if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED | @@ -948,13 +1008,6 @@ static inline void io_clean_op(struct io_kiocb *req) __io_clean_op(req); } -/* not idempotent -- it doesn't clear REQ_F_TASK_PINNED */ -static void __io_put_req_task(struct io_kiocb *req) -{ - if (req->flags & REQ_F_TASK_PINNED) - put_task_struct(req->task); -} - static void io_sq_thread_drop_mm(void) { struct mm_struct *mm = current->mm; @@ -969,9 +1022,10 @@ static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) { if (!current->mm) { if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL) || - !mmget_not_zero(ctx->sqo_mm))) + !ctx->sqo_task->mm || + !mmget_not_zero(ctx->sqo_task->mm))) return -EFAULT; - kthread_use_mm(ctx->sqo_mm); + kthread_use_mm(ctx->sqo_task->mm); } return 0; @@ -985,6 +1039,26 @@ static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, return __io_sq_thread_acquire_mm(ctx); } +static void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx, + struct cgroup_subsys_state **cur_css) + +{ +#ifdef CONFIG_BLK_CGROUP + /* puts the old one when swapping */ + if (*cur_css != ctx->sqo_blkcg_css) { + kthread_associate_blkcg(ctx->sqo_blkcg_css); + *cur_css = ctx->sqo_blkcg_css; + } +#endif +} + +static void io_sq_thread_unassociate_blkcg(void) +{ +#ifdef CONFIG_BLK_CGROUP + kthread_associate_blkcg(NULL); +#endif +} + static inline void req_set_fail_links(struct io_kiocb *req) { if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK) @@ -1054,7 +1128,8 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) goto err; ctx->flags = p->flags; - init_waitqueue_head(&ctx->sqo_wait); + init_waitqueue_head(&ctx->sqo_sq_wait); + INIT_LIST_HEAD(&ctx->sqd_list); init_waitqueue_head(&ctx->cq_wait); INIT_LIST_HEAD(&ctx->cq_overflow_list); init_completion(&ctx->ref_comp); @@ -1121,6 +1196,10 @@ static bool io_req_clean_work(struct io_kiocb *req) mmdrop(req->work.mm); req->work.mm = NULL; } +#ifdef CONFIG_BLK_CGROUP + if (req->work.blkcg_css) + 
css_put(req->work.blkcg_css); +#endif if (req->work.creds) { put_cred(req->work.creds); req->work.creds = NULL; @@ -1146,20 +1225,45 @@ static bool io_req_clean_work(struct io_kiocb *req) static void io_prep_async_work(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; + struct io_ring_ctx *ctx = req->ctx; io_req_init_async(req); if (req->flags & REQ_F_ISREG) { - if (def->hash_reg_file || (req->ctx->flags & IORING_SETUP_IOPOLL)) + if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL)) io_wq_hash_work(&req->work, file_inode(req->file)); } else { if (def->unbound_nonreg_file) req->work.flags |= IO_WQ_WORK_UNBOUND; } + if (!req->work.files && io_op_defs[req->opcode].file_table && + !(req->flags & REQ_F_NO_FILE_TABLE)) { + req->work.files = get_files_struct(current); + get_nsproxy(current->nsproxy); + req->work.nsproxy = current->nsproxy; + req->flags |= REQ_F_INFLIGHT; + + spin_lock_irq(&ctx->inflight_lock); + list_add(&req->inflight_entry, &ctx->inflight_list); + spin_unlock_irq(&ctx->inflight_lock); + } if (!req->work.mm && def->needs_mm) { mmgrab(current->mm); req->work.mm = current->mm; } +#ifdef CONFIG_BLK_CGROUP + if (!req->work.blkcg_css && def->needs_blkcg) { + rcu_read_lock(); + req->work.blkcg_css = blkcg_css(); + /* + * This should be rare, either the cgroup is dying or the task + * is moving cgroups. Just punt to root for the handful of ios. + */ + if (!css_tryget_online(req->work.blkcg_css)) + req->work.blkcg_css = NULL; + rcu_read_unlock(); + } +#endif if (!req->w |