summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--fs/io-wq.c187
-rw-r--r--fs/io-wq.h63
-rw-r--r--fs/io_uring.c776
-rw-r--r--include/linux/socket.h3
-rw-r--r--include/trace/events/io_uring.h16
-rw-r--r--include/uapi/linux/io_uring.h1
-rw-r--r--net/socket.c214
7 files changed, 757 insertions, 503 deletions
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 9174007ce107..91b85df0861e 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -33,6 +33,7 @@ enum {
enum {
IO_WQ_BIT_EXIT = 0, /* wq exiting */
IO_WQ_BIT_CANCEL = 1, /* cancel work on list */
+ IO_WQ_BIT_ERROR = 2, /* error on setup */
};
enum {
@@ -56,6 +57,7 @@ struct io_worker {
struct rcu_head rcu;
struct mm_struct *mm;
+ const struct cred *creds;
struct files_struct *restore_files;
};
@@ -82,7 +84,7 @@ enum {
struct io_wqe {
struct {
spinlock_t lock;
- struct list_head work_list;
+ struct io_wq_work_list work_list;
unsigned long hash_map;
unsigned flags;
} ____cacheline_aligned_in_smp;
@@ -103,13 +105,13 @@ struct io_wqe {
struct io_wq {
struct io_wqe **wqes;
unsigned long state;
- unsigned nr_wqes;
get_work_fn *get_work;
put_work_fn *put_work;
struct task_struct *manager;
struct user_struct *user;
+ struct cred *creds;
struct mm_struct *mm;
refcount_t refs;
struct completion done;
@@ -135,6 +137,11 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
{
bool dropped_lock = false;
+ if (worker->creds) {
+ revert_creds(worker->creds);
+ worker->creds = NULL;
+ }
+
if (current->files != worker->restore_files) {
__acquire(&wqe->lock);
spin_unlock_irq(&wqe->lock);
@@ -229,7 +236,8 @@ static void io_worker_exit(struct io_worker *worker)
static inline bool io_wqe_run_queue(struct io_wqe *wqe)
__must_hold(wqe->lock)
{
- if (!list_empty(&wqe->work_list) && !(wqe->flags & IO_WQE_FLAG_STALLED))
+ if (!wq_list_empty(&wqe->work_list) &&
+ !(wqe->flags & IO_WQE_FLAG_STALLED))
return true;
return false;
}
@@ -327,9 +335,9 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
* If worker is moving from bound to unbound (or vice versa), then
* ensure we update the running accounting.
*/
- worker_bound = (worker->flags & IO_WORKER_F_BOUND) != 0;
- work_bound = (work->flags & IO_WQ_WORK_UNBOUND) == 0;
- if (worker_bound != work_bound) {
+ worker_bound = (worker->flags & IO_WORKER_F_BOUND) != 0;
+ work_bound = (work->flags & IO_WQ_WORK_UNBOUND) == 0;
+ if (worker_bound != work_bound) {
io_wqe_dec_running(wqe, worker);
if (work_bound) {
worker->flags |= IO_WORKER_F_BOUND;
@@ -368,12 +376,15 @@ static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker)
static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash)
__must_hold(wqe->lock)
{
+ struct io_wq_work_node *node, *prev;
struct io_wq_work *work;
- list_for_each_entry(work, &wqe->work_list, list) {
+ wq_list_for_each(node, prev, &wqe->work_list) {
+ work = container_of(node, struct io_wq_work, list);
+
/* not hashed, can run anytime */
if (!(work->flags & IO_WQ_WORK_HASHED)) {
- list_del(&work->list);
+ wq_node_del(&wqe->work_list, node, prev);
return work;
}
@@ -381,7 +392,7 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash)
*hash = work->flags >> IO_WQ_HASH_SHIFT;
if (!(wqe->hash_map & BIT_ULL(*hash))) {
wqe->hash_map |= BIT_ULL(*hash);
- list_del(&work->list);
+ wq_node_del(&wqe->work_list, node, prev);
return work;
}
}
@@ -409,7 +420,7 @@ static void io_worker_handle_work(struct io_worker *worker)
work = io_get_next_work(wqe, &hash);
if (work)
__io_worker_busy(wqe, worker, work);
- else if (!list_empty(&wqe->work_list))
+ else if (!wq_list_empty(&wqe->work_list))
wqe->flags |= IO_WQE_FLAG_STALLED;
spin_unlock_irq(&wqe->lock);
@@ -426,6 +437,9 @@ next:
worker->cur_work = work;
spin_unlock_irq(&worker->lock);
+ if (work->flags & IO_WQ_WORK_CB)
+ work->func(&work);
+
if ((work->flags & IO_WQ_WORK_NEEDS_FILES) &&
current->files != work->files) {
task_lock(current);
@@ -438,6 +452,8 @@ next:
set_fs(USER_DS);
worker->mm = wq->mm;
}
+ if (!worker->creds)
+ worker->creds = override_creds(wq->creds);
if (test_bit(IO_WQ_BIT_CANCEL, &wq->state))
work->flags |= IO_WQ_WORK_CANCEL;
if (worker->mm)
@@ -514,7 +530,7 @@ static int io_wqe_worker(void *data)
if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
spin_lock_irq(&wqe->lock);
- if (!list_empty(&wqe->work_list))
+ if (!wq_list_empty(&wqe->work_list))
io_worker_handle_work(worker);
else
spin_unlock_irq(&wqe->lock);
@@ -562,14 +578,14 @@ void io_wq_worker_sleeping(struct task_struct *tsk)
spin_unlock_irq(&wqe->lock);
}
-static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
+static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
{
struct io_wqe_acct *acct =&wqe->acct[index];
struct io_worker *worker;
- worker = kcalloc_node(1, sizeof(*worker), GFP_KERNEL, wqe->node);
+ worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node);
if (!worker)
- return;
+ return false;
refcount_set(&worker->ref, 1);
worker->nulls_node.pprev = NULL;
@@ -581,7 +597,7 @@ static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
"io_wqe_worker-%d/%d", index, wqe->node);
if (IS_ERR(worker->task)) {
kfree(worker);
- return;
+ return false;
}
spin_lock_irq(&wqe->lock);
@@ -599,6 +615,7 @@ static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
atomic_inc(&wq->user->processes);
wake_up_process(worker->task);
+ return true;
}
static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index)
@@ -606,9 +623,6 @@ static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index)
{
struct io_wqe_acct *acct = &wqe->acct[index];
- /* always ensure we have one bounded worker */
- if (index == IO_WQ_ACCT_BOUND && !acct->nr_workers)
- return true;
/* if we have available workers or no work, no need */
if (!hlist_nulls_empty(&wqe->free_list) || !io_wqe_run_queue(wqe))
return false;
@@ -621,12 +635,22 @@ static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index)
static int io_wq_manager(void *data)
{
struct io_wq *wq = data;
+ int workers_to_create = num_possible_nodes();
+ int node;
- while (!kthread_should_stop()) {
- int i;
+ /* create fixed workers */
+ refcount_set(&wq->refs, workers_to_create);
+ for_each_node(node) {
+ if (!create_io_worker(wq, wq->wqes[node], IO_WQ_ACCT_BOUND))
+ goto err;
+ workers_to_create--;
+ }
+
+ complete(&wq->done);
- for (i = 0; i < wq->nr_wqes; i++) {
- struct io_wqe *wqe = wq->wqes[i];
+ while (!kthread_should_stop()) {
+ for_each_node(node) {
+ struct io_wqe *wqe = wq->wqes[node];
bool fork_worker[2] = { false, false };
spin_lock_irq(&wqe->lock);
@@ -645,6 +669,12 @@ static int io_wq_manager(void *data)
}
return 0;
+err:
+ set_bit(IO_WQ_BIT_ERROR, &wq->state);
+ set_bit(IO_WQ_BIT_EXIT, &wq->state);
+ if (refcount_sub_and_test(workers_to_create, &wq->refs))
+ complete(&wq->done);
+ return 0;
}
static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
@@ -688,7 +718,7 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
}
spin_lock_irqsave(&wqe->lock, flags);
- list_add_tail(&work->list, &wqe->work_list);
+ wq_list_add_tail(&work->list, &wqe->work_list);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
spin_unlock_irqrestore(&wqe->lock, flags);
@@ -750,7 +780,7 @@ static bool io_wq_for_each_worker(struct io_wqe *wqe,
void io_wq_cancel_all(struct io_wq *wq)
{
- int i;
+ int node;
set_bit(IO_WQ_BIT_CANCEL, &wq->state);
@@ -759,8 +789,8 @@ void io_wq_cancel_all(struct io_wq *wq)
* to a worker and the worker putting itself on the busy_list
*/
rcu_read_lock();
- for (i = 0; i < wq->nr_wqes; i++) {
- struct io_wqe *wqe = wq->wqes[i];
+ for_each_node(node) {
+ struct io_wqe *wqe = wq->wqes[node];
io_wq_for_each_worker(wqe, io_wqe_worker_send_sig, NULL);
}
@@ -803,14 +833,17 @@ static enum io_wq_cancel io_wqe_cancel_cb_work(struct io_wqe *wqe,
.cancel = cancel,
.caller_data = cancel_data,
};
+ struct io_wq_work_node *node, *prev;
struct io_wq_work *work;
unsigned long flags;
bool found = false;
spin_lock_irqsave(&wqe->lock, flags);
- list_for_each_entry(work, &wqe->work_list, list) {
+ wq_list_for_each(node, prev, &wqe->work_list) {
+ work = container_of(node, struct io_wq_work, list);
+
if (cancel(work, cancel_data)) {
- list_del(&work->list);
+ wq_node_del(&wqe->work_list, node, prev);
found = true;
break;
}
@@ -833,10 +866,10 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
void *data)
{
enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND;
- int i;
+ int node;
- for (i = 0; i < wq->nr_wqes; i++) {
- struct io_wqe *wqe = wq->wqes[i];
+ for_each_node(node) {
+ struct io_wqe *wqe = wq->wqes[node];
ret = io_wqe_cancel_cb_work(wqe, cancel, data);
if (ret != IO_WQ_CANCEL_NOTFOUND)
@@ -868,6 +901,7 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
struct io_wq_work *cwork)
{
+ struct io_wq_work_node *node, *prev;
struct io_wq_work *work;
unsigned long flags;
bool found = false;
@@ -880,9 +914,11 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
* no completion will be posted for it.
*/
spin_lock_irqsave(&wqe->lock, flags);
- list_for_each_entry(work, &wqe->work_list, list) {
+ wq_list_for_each(node, prev, &wqe->work_list) {
+ work = container_of(node, struct io_wq_work, list);
+
if (work == cwork) {
- list_del(&work->list);
+ wq_node_del(&wqe->work_list, node, prev);
found = true;
break;
}
@@ -910,10 +946,10 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
{
enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND;
- int i;
+ int node;
- for (i = 0; i < wq->nr_wqes; i++) {
- struct io_wqe *wqe = wq->wqes[i];
+ for_each_node(node) {
+ struct io_wqe *wqe = wq->wqes[node];
ret = io_wqe_cancel_work(wqe, cwork);
if (ret != IO_WQ_CANCEL_NOTFOUND)
@@ -944,10 +980,10 @@ static void io_wq_flush_func(struct io_wq_work **workptr)
void io_wq_flush(struct io_wq *wq)
{
struct io_wq_flush_data data;
- int i;
+ int node;
- for (i = 0; i < wq->nr_wqes; i++) {
- struct io_wqe *wqe = wq->wqes[i];
+ for_each_node(node) {
+ struct io_wqe *wqe = wq->wqes[node];
init_completion(&data.done);
INIT_IO_WORK(&data.work, io_wq_flush_func);
@@ -957,43 +993,39 @@ void io_wq_flush(struct io_wq *wq)
}
}
-struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm,
- struct user_struct *user, get_work_fn *get_work,
- put_work_fn *put_work)
+struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
{
- int ret = -ENOMEM, i, node;
+ int ret = -ENOMEM, node;
struct io_wq *wq;
- wq = kcalloc(1, sizeof(*wq), GFP_KERNEL);
+ wq = kzalloc(sizeof(*wq), GFP_KERNEL);
if (!wq)
return ERR_PTR(-ENOMEM);
- wq->nr_wqes = num_online_nodes();
- wq->wqes = kcalloc(wq->nr_wqes, sizeof(struct io_wqe *), GFP_KERNEL);
+ wq->wqes = kcalloc(nr_node_ids, sizeof(struct io_wqe *), GFP_KERNEL);
if (!wq->wqes) {
kfree(wq);
return ERR_PTR(-ENOMEM);
}
- wq->get_work = get_work;
- wq->put_work = put_work;
+ wq->get_work = data->get_work;
+ wq->put_work = data->put_work;
/* caller must already hold a reference to this */
- wq->user = user;
+ wq->user = data->user;
+ wq->creds = data->creds;
- i = 0;
- refcount_set(&wq->refs, wq->nr_wqes);
- for_each_online_node(node) {
+ for_each_node(node) {
struct io_wqe *wqe;
- wqe = kcalloc_node(1, sizeof(struct io_wqe), GFP_KERNEL, node);
+ wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, node);
if (!wqe)
- break;
- wq->wqes[i] = wqe;
+ goto err;
+ wq->wqes[node] = wqe;
wqe->node = node;
wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded;
atomic_set(&wqe->acct[IO_WQ_ACCT_BOUND].nr_running, 0);
- if (user) {
+ if (wq->user) {
wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers =
task_rlimit(current, RLIMIT_NPROC);
}
@@ -1001,33 +1033,36 @@ struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm,
wqe->node = node;
wqe->wq = wq;
spin_lock_init(&wqe->lock);
- INIT_LIST_HEAD(&wqe->work_list);
+ INIT_WQ_LIST(&wqe->work_list);
INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0);
INIT_HLIST_NULLS_HEAD(&wqe->busy_list, 1);
INIT_LIST_HEAD(&wqe->all_list);
-
- i++;
}
init_completion(&wq->done);
- if (i != wq->nr_wqes)
- goto err;
-
/* caller must have already done mmgrab() on this mm */
- wq->mm = mm;
+ wq->mm = data->mm;
wq->manager = kthread_create(io_wq_manager, wq, "io_wq_manager");
if (!IS_ERR(wq->manager)) {
wake_up_process(wq->manager);
+ wait_for_completion(&wq->done);
+ if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ reinit_completion(&wq->done);
return wq;
}
ret = PTR_ERR(wq->manager);
- wq->manager = NULL;
-err:
complete(&wq->done);
- io_wq_destroy(wq);
+err:
+ for_each_node(node)
+ kfree(wq->wqes[node]);
+ kfree(wq->wqes);
+ kfree(wq);
return ERR_PTR(ret);
}
@@ -1039,27 +1074,21 @@ static bool io_wq_worker_wake(struct io_worker *worker, void *data)
void io_wq_destroy(struct io_wq *wq)
{
- int i;
+ int node;
- if (wq->manager) {
- set_bit(IO_WQ_BIT_EXIT, &wq->state);
+ set_bit(IO_WQ_BIT_EXIT, &wq->state);
+ if (wq->manager)
kthread_stop(wq->manager);
- }
rcu_read_lock();
- for (i = 0; i < wq->nr_wqes; i++) {
- struct io_wqe *wqe = wq->wqes[i];
-
- if (!wqe)
- continue;
- io_wq_for_each_worker(wqe, io_wq_worker_wake, NULL);
- }
+ for_each_node(node)
+ io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL);
rcu_read_unlock();
wait_for_completion(&wq->done);
- for (i = 0; i < wq->nr_wqes; i++)
- kfree(wq->wqes[i]);
+ for_each_node(node)
+ kfree(wq->wqes[node]);
kfree(wq->wqes);
kfree(wq);
}
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 4b29f922f80c..600e0158cba7 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -11,6 +11,7 @@ enum {
IO_WQ_WORK_NEEDS_FILES = 16,
IO_WQ_WORK_UNBOUND = 32,
IO_WQ_WORK_INTERNAL = 64,
+ IO_WQ_WORK_CB = 128,
IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */
};
@@ -21,15 +22,60 @@ enum io_wq_cancel {
IO_WQ_CANCEL_NOTFOUND, /* work not found */
};
+struct io_wq_work_node {
+ struct io_wq_work_node *next;
+};
+
+struct io_wq_work_list {
+ struct io_wq_work_node *first;
+ struct io_wq_work_node *last;
+};
+
+static inline void wq_list_add_tail(struct io_wq_work_node *node,
+ struct io_wq_work_list *list)
+{
+ if (!list->first) {
+ list->first = list->last = node;
+ } else {
+ list->last->next = node;
+ list->last = node;
+ }
+}
+
+static inline void wq_node_del(struct io_wq_work_list *list,
+ struct io_wq_work_node *node,
+ struct io_wq_work_node *prev)
+{
+ if (node == list->first)
+ list->first = node->next;
+ if (node == list->last)
+ list->last = prev;
+ if (prev)
+ prev->next = node->next;
+}
+
+#define wq_list_for_each(pos, prv, head) \
+ for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)
+
+#define wq_list_empty(list) ((list)->first == NULL)
+#define INIT_WQ_LIST(list) do { \
+ (list)->first = NULL; \
+ (list)->last = NULL; \
+} while (0)
+
struct io_wq_work {
- struct list_head list;
+ union {
+ struct io_wq_work_node list;
+ void *data;
+ };
void (*func)(struct io_wq_work **);
- unsigned flags;
struct files_struct *files;
+ unsigned flags;
};
#define INIT_IO_WORK(work, _func) \
do { \
+ (work)->list.next = NULL; \
(work)->func = _func; \
(work)->flags = 0; \
(work)->files = NULL; \
@@ -38,9 +84,16 @@ struct io_wq_work {
typedef void (get_work_fn)(struct io_wq_work *);
typedef void (put_work_fn)(struct io_wq_work *);
-struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm,
- struct user_struct *user,
- get_work_fn *get_work, put_work_fn *put_work);
+struct io_wq_data {
+ struct mm_struct *mm;
+ struct user_struct *user;
+ struct cred *creds;
+
+ get_work_fn *get_work;
+ put_work_fn *put_work;
+};
+
+struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
void io_wq_destroy(struct io_wq *wq);
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 4c030a92de79..2c2e8c25da01 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -186,6 +186,7 @@ struct io_ring_ctx {
bool compat;
bool account_mem;
bool cq_overflow_flushed;
+ bool drain_next;
/*
* Ring buffer of indices into array of io_uring_sqe, which is
@@ -236,6 +237,8 @@ struct io_ring_ctx {
struct user_struct *user;
+ struct cred *creds;
+
/* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */
struct completion *completions;
@@ -278,16 +281,6 @@ struct io_ring_ctx {
} ____cacheline_aligned_in_smp;
};
-struct sqe_submit {
- const struct io_uring_sqe *sqe;
- struct file *ring_file;
- int ring_fd;
- u32 sequence;
- bool has_user;
- bool in_async;
- bool needs_fixed_file;
-};
-
/*
* First field must be the file pointer in all the
* iocb unions! See also 'struct kiocb' in <linux/fs.h>
@@ -298,12 +291,20 @@ struct io_poll_iocb {
__poll_t events;
bool done;
bool canceled;
- struct wait_queue_entry wait;
+ struct wait_queue_entry *wait;
+};
+
+struct io_timeout_data {
+ struct io_kiocb *req;
+ struct hrtimer timer;
+ struct timespec64 ts;
+ enum hrtimer_mode mode;
+ u32 seq_offset;
};
struct io_timeout {
struct file *file;
- struct hrtimer timer;
+ struct io_timeout_data *data;
};
/*
@@ -320,7 +321,12 @@ struct io_kiocb {
struct io_timeout timeout;
};
- struct sqe_submit submit;
+ const struct io_uring_sqe *sqe;
+ struct file *ring_file;
+ int ring_fd;
+ bool has_user;
+ bool in_async;
+ bool needs_fixed_file;
struct io_ring_ctx *ctx;
union {
@@ -333,19 +339,20 @@ struct io_kiocb {
#define REQ_F_NOWAIT 1 /* must not punt to workers */
#define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */
#define REQ_F_FIXED_FILE 4 /* ctx owns file */
-#define REQ_F_SEQ_PREV 8 /* sequential with previous */
+#define REQ_F_LINK_NEXT 8 /* already grabbed next link */
#define REQ_F_IO_DRAIN 16 /* drain existing IO first */
#define REQ_F_IO_DRAINED 32 /* drain done */
#define REQ_F_LINK 64 /* linked sqes */
#define REQ_F_LINK_TIMEOUT 128 /* has linked timeout */
#define REQ_F_FAIL_LINK 256 /* fail rest of links */
-#define REQ_F_SHADOW_DRAIN 512 /* link-drain shadow req */
+#define REQ_F_DRAIN_LINK 512 /* link should be fully drained */
#define REQ_F_TIMEOUT 1024 /* timeout request */
#define REQ_F_ISREG 2048 /* regular file */
#define REQ_F_MUST_PUNT 4096 /* must be punted even for NONBLOCK */
#define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */
#define REQ_F_INFLIGHT 16384 /* on inflight list */
#define REQ_F_COMP_LOCKED 32768 /* completion under lock */
+#define REQ_F_FREE_SQE 65536 /* free sqe if not async queued */
u64 user_data;
u32 result;
u32 sequence;
@@ -383,6 +390,9 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res);
static void __io_free_req(struct io_kiocb *req);
static void io_put_req(struct io_kiocb *req);
static void io_double_put_req(struct io_kiocb *req);
+static void __io_double_put_req(struct io_kiocb *req);
+static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
+static void io_queue_linked_timeout(struct io_kiocb *req);
static struct kmem_cache *req_cachep;
@@ -521,12 +531,13 @@ static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe)
opcode == IORING_OP_WRITE_FIXED);
}
-static inline bool io_prep_async_work(struct io_kiocb *req)
+static inline bool io_prep_async_work(struct io_kiocb *req,
+ struct io_kiocb **link)
{
bool do_hashed = false;
- if (req->submit.sqe) {
- switch (req->submit.sqe->opcode) {
+ if (req->sqe) {
+ switch (req->sqe->opcode) {
case IORING_OP_WRITEV:
case IORING_OP_WRITE_FIXED:
do_hashed = true;
@@ -537,6 +548,7 @@ static inline bool io_prep_async_work(struct io_kiocb *req)
case IORING_OP_RECVMSG:
case IORING_OP_ACCEPT:
case IORING_OP_POLL_ADD:
+ case IORING_OP_CONNECT:
/*
* We know REQ_F_ISREG is not set on some of these
* opcodes, but this enables us to keep the check in
@@ -546,17 +558,21 @@ static inline bool io_prep_async_work(struct io_kiocb *req)
req->work.flags |= IO_WQ_WORK_UNBOUND;
break;
}
- if (io_sqe_needs_user(req->submit.sqe))
+ if (io_sqe_needs_user(req->sqe))
req->work.flags |= IO_WQ_WORK_NEEDS_USER;
}
+ *link = io_prep_linked_timeout(req);
return do_hashed;
}
static inline void io_queue_async_work(struct io_kiocb *req)
{
- bool do_hashed = io_prep_async_work(req);
struct io_ring_ctx *ctx = req->ctx;
+ struct io_kiocb *link;
+ bool do_hashed;
+
+ do_hashed = io_prep_async_work(req, &link);
trace_io_uring_queue_async_work(ctx, do_hashed, req, &req->work,
req->flags);
@@ -566,13 +582,16 @@ static inline void io_queue_async_work(struct io_kiocb *req)
io_wq_enqueue_hashed(ctx->io_wq, &req->work,
file_inode(req->file));
}
+
+ if (link)
+ io_queue_linked_timeout(link);
}
static void io_kill_timeout(struct io_kiocb *req)
{
int ret;
- ret = hrtimer_try_to_cancel(&req->timeout.timer);
+ ret = hrtimer_try_to_cancel(&req->timeout.data->timer);
if (ret != -1) {
atomic_inc(&req->ctx->cq_timeouts);
list_del_init(&req->list);
@@ -601,11 +620,6 @@ static void io_commit_cqring(struct io_ring_ctx *ctx)
__io_commit_cqring(ctx);
while ((req = io_get_deferred_req(ctx)) != NULL) {
- if (req->flags & REQ_F_SHADOW_DRAIN) {
- /* Just for drain, free it. */
- __io_free_req(req);
- continue;
- }
req->flags |= REQ_F_IO_DRAINED;
io_queue_async_work(req);
}
@@ -639,7 +653,8 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
eventfd_signal(ctx->cq_ev_fd, 1);
}
-static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
+/* Returns true if there are no backlogged entries after the flush */
+static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
{
struct io_rings *rings = ctx->rings;
struct io_uring_cqe *cqe;
@@ -649,10 +664,10 @@ static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
if (!force) {
if (list_empty_careful(&ctx->cq_overflow_list))
- return;
+ return true;
if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
rings->cq_ring_entries))
- return;
+ return false;
}
spin_lock_irqsave(&ctx->completion_lock, flags);
@@ -661,6 +676,7 @@ static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
if (force)
ctx->cq_overflow_flushed = true;
+ cqe = NULL;
while (!list_empty(&ctx->cq_overflow_list)) {
cqe = io_get_cqring(ctx);
if (!cqe && !force)
@@ -688,6 +704,8 @@ static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
list_del(&req->list);
io_put_req(req);
}
+
+ return cqe != NULL;
}
static void io_cqring_fill_event(struct io_kiocb *req, long res)
@@ -787,6 +805,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
}
got_it:
+ req->ring_file = NULL;
req->file = NULL;
req->ctx = ctx;
req->flags = 0;
@@ -816,6 +835,8 @@ static void __io_free_req(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
+ if (req->flags & REQ_F_FREE_SQE)
+ kfree(req->sqe);
if (req->file && !(req->flags & REQ_F_FIXED_FILE))
fput(req->file);
if (req->flags & REQ_F_INFLIGHT) {
@@ -827,6 +848,8 @@ static void __io_free_req(struct io_kiocb *req)
wake_up(&ctx->inflight_wait);
spin_unlock_irqrestore(&ctx->inflight_lock, flags);
}
+ if (req->flags & REQ_F_TIMEOUT)
+ kfree(req->timeout.data);
percpu_ref_put(&ctx->refs);
if (likely(!io_is_fallback_req(req)))
kmem_cache_free(req_cachep, req);
@@ -839,7 +862,7 @@ static bool io_link_cancel_timeout(struct io_kiocb *req)
struct io_ring_ctx *ctx = req->ctx;
int ret;
- ret = hrtimer_try_to_cancel(&req->timeout.timer);
+ ret = hrtimer_try_to_cancel(&req->timeout.data->timer);
if (ret != -1) {
io_cqring_fill_event(req, -ECANCELED);
io_commit_cqring(ctx);
@@ -857,6 +880,10 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
struct io_kiocb *nxt;
bool wake_ev = false;
+ /* Already got next link */
+ if (req->flags & REQ_F_LINK_NEXT)
+ return;
+
/*
* The list should never be empty when we are called here. But could
* potentially happen if the chain is messed up, check to be on the
@@ -865,31 +892,26 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list);
while (nxt) {
list_del_init(&nxt->list);
+
+ if ((req->flags & REQ_F_LINK_TIMEOUT) &&
+ (nxt->flags & REQ_F_TIMEOUT)) {
+ wake_ev |= io_link_cancel_timeout(nxt);
+ nxt = list_first_entry_or_null(&req->link_list,
+ struct io_kiocb, list);
+ req->flags &= ~REQ_F_LINK_TIMEOUT;
+ continue;
+ }
if (!list_empty(&req->link_list)) {
INIT_LIST_HEAD(&nxt->link_list);
list_splice(&req->link_list, &nxt->link_list);
nxt->flags |= REQ_F_LINK;
}
- /*
- * If we're in async work, we can continue processing the chain
- * in this context instead of having to queue up new async work.
- */
- if (req->flags & REQ_F_LINK_TIMEOUT) {
- wake_ev = io_link_cancel_timeout(nxt);
-
- /* we dropped this link, get next */
- nxt = list_first_entry_or_null(&req->link_list,
- struct io_kiocb, list);
- } else if (nxtptr && io_wq_current_is_worker()) {
- *nxtptr = nxt;
- break;
- } else {
- io_queue_async_work(nxt);
- break;
- }
+ *nxtptr = nxt;
+ break;
}
+ req->flags |= REQ_F_LINK_NEXT;
if (wake_ev)
io_cqring_ev_posted(ctx);
}
@@ -912,12 +934,13 @@ static void io_fail_links(struct io_kiocb *req)
trace_io_uring_fail_link(req, link);
if ((req->flags & REQ_F_LINK_TIMEOUT) &&
- link->submit.sqe->opcode == IORING_OP_LINK_TIMEOUT) {
+ link->sqe->opcode == IORING_OP_LINK_TIMEOUT) {
io_link_cancel_timeout(link);
} else {
io_cqring_fill_event(link, -ECANCELED);
- io_double_put_req(link);
+ __io_double_put_req(link);
}
+ req->flags &= ~REQ_F_LINK_TIMEOUT;
}
io_commit_cqring(ctx);
@@ -925,12 +948,10 @@ static void io_fail_links(struct io_kiocb *req)
io_cqring_ev_posted(ctx);
}
-static void io_free_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
+static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
{
- if (likely(!(req->flags & REQ_F_LINK))) {
- __io_free_req(req);
+ if (likely(!(req->flags & REQ_F_LINK)))
return;
- }
/*
* If LINK is set, we have dependent requests in this chain. If we
@@ -956,32 +977,30 @@ static void io_free_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
} else {
io_req_link_next(req, nxt);
}
-
- __io_free_req(req);
}
static void io_free_req(struct io_kiocb *req)
{
- io_free_req_find_next(req, NULL);
+ struct io_kiocb *nxt = NULL;
+
+ io_req_find_next(req, &nxt);
+ __io_free_req(req);
+
+ if (nxt)
+ io_queue_async_work(nxt);
}
/*
* Drop reference to request, return next in chain (if there is one) if this
* was the last reference to this request.
*/
+__attribute__((nonnull))
static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
{
- struct io_kiocb *nxt = NULL;
+ io_req_find_next(req, nxtptr);
if (refcount_dec_and_test(&req->refs))
- io_free_req_find_next(req, &nxt);
-
- if (nxt) {
- if (nxtptr)
- *nxtptr = nxt;
- else
- io_queue_async_work(nxt);
- }
+ __io_free_req(req);
}
static void io_put_req(struct io_kiocb *req)
@@ -990,13 +1009,24 @@ static void io_put_req(struct io_kiocb *req)
io_free_req(req);
}
-static void io_double_put_req(struct io_kiocb *req)
+/*
+ * Must only be used if we don't need to care about links, usually from
+ * within the completion handling itself.
+ */
+static void __io_double_put_req(struct io_kiocb *req)
{
/* drop both submit and complete references */
if (refcount_sub_and_test(2, &req->refs))
__io_free_req(req);
}
+static void io_double_put_req(struct io_kiocb *req)
+{
+ /* drop both submit and complete references */
+ if (refcount_sub_and_test(2, &req->refs))
+ io_free_req(req);
+}
+
static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
{
struct io_rings *rings = ctx->rings;
@@ -1048,7 +1078,8 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
* completions for those, only batch free for fixed
* file and non-linked commands.
*/
- if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) ==
+ if (((req->flags &
+ (REQ_F_FIXED_FILE|REQ_F_LINK|REQ_F_FREE_SQE)) ==
REQ_F_FIXED_FILE) && !io_is_fallback_req(req)) {
reqs[to_free++] = req;
if (to_free == ARRAY_SIZE(reqs))
@@ -1366,7 +1397,7 @@ static bool io_file_supports_async(struct file *file)
static int io_prep_rw(struct io_kiocb *req, bool force_nonblock)
{
- const struct io_uring_sqe *sqe = req->submit.sqe;
+ const struct io_uring_sqe *sqe = req->sqe;
struct io_ring_ctx *ctx = req->ctx;
struct kiocb *kiocb = &req->rw;
unsigned ioprio;
@@ -1453,15 +1484,15 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt,
bool in_async)
{
- if (in_async && ret >= 0 && nxt && kiocb->ki_complete == io_complete_rw)
+ if (in_async && ret >= 0 && kiocb->ki_complete == io_complete_rw)
*nxt = __io_complete_rw(kiocb, ret);
else
io_rw_done(kiocb, ret);
}
-static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
- const struct io_uring_sqe *sqe,
- struct iov_iter *iter)
+static ssize_t io_import_fixed(struct io_ring_ctx *ctx, int rw,
+ const struct io_uring_sqe *sqe,
+ struct iov_iter *iter)
{
size_t len = READ_ONCE(sqe->len);
struct io_mapped_ubuf *imu;
@@ -1533,11 +1564,10 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
return len;
}
-static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw,
- const struct sqe_submit *s, struct iovec **iovec,
- struct iov_iter *iter)
+static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
+ struct iovec **iovec, struct iov_iter *iter)
{
- const struct io_uring_sqe *sqe = s->sqe;
+ const struct io_uring_sqe *sqe = req->sqe;
void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
size_t sqe_len = READ_ONCE(sqe->len);
u8 opcode;
@@ -1551,18 +1581,16 @@ static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw,
* flag.
*/
opcode = READ_ONCE(sqe->opcode);
- if (opcode == IORING_OP_READ_FIXED ||
- opcode == IORING_OP_WRITE_FIXED) {
- ssize_t ret = io_import_fixed(ctx, rw, sqe, iter);
+ if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
*iovec = NULL;
- return ret;
+ return io_import_fixed(req->ctx, rw, sqe, iter);
}
- if (!s->has_user)
+ if (!req->has_user)
return -EFAULT;
#ifdef CONFIG_COMPAT
- if (ctx->compat)
+ if (req->ctx->compat)
return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
iovec, iter);
#endif
@@ -1590,9 +1618,19 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
return -EAGAIN;
while (iov_iter_count(iter)) {
- struct iovec iovec = iov_iter_iovec(iter);
+ struct iovec iovec;
ssize_t nr;
+ if (!iov_iter_is_bvec(iter)) {
+ iovec = iov_iter_iovec(iter);
+ } else {
+ /* fixed buffers import bvec */
+ iovec.iov_base = kmap(iter->bvec->bv_page)
+ + iter->iov_offset;
+ iovec.iov_len = min(iter->count,
+ iter->bvec->bv_len - iter->iov_offset);
+ }
+
if (rw == READ) {
nr = file->f_op->read(file, iovec.iov_base,
iovec.iov_len, &kiocb->ki_pos);
@@ -1601,6 +1639,9 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
iovec.iov_len, &kiocb->ki_pos);
}
+ if (iov_iter_is_bvec(iter))
+ kunmap(iter->bvec->bv_page);
+
if (nr < 0) {
if (!ret)
ret = nr;
@@ -1633,7 +1674,7 @@ static int io_read(struct io_ki