summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2020-03-30 12:18:49 -0700
committer Linus Torvalds <torvalds@linux-foundation.org> 2020-03-30 12:18:49 -0700
commit e59cd88028dbd41472453e5883f78330aa73c56e (patch)
tree 5576a15b6cf3e70df7cf6d0a2d7711b4bc0417ed
parent 1592614838cb52f4313ceff64894e2ca78591498 (diff)
parent 3d9932a8b240c9019f48358e8a6928c53c2c7f6b (diff)
Merge tag 'for-5.7/io_uring-2020-03-29' of git://git.kernel.dk/linux-block
Pull io_uring updates from Jens Axboe: "Here are the io_uring changes for this merge window. Light on new features this time around (just splice + buffer selection), lots of cleanups, fixes, and improvements to existing support. In particular, this contains: - Cleanup fixed file update handling for stack fallback (Hillf) - Re-work of how pollable async IO is handled, we no longer require thread offload to handle that. Instead we rely on using poll to drive this, with task_work execution. - In conjunction with the above, allow expendable buffer selection, so that poll+recv (for example) no longer has to be a split operation. - Make sure we honor RLIMIT_FSIZE for buffered writes - Add support for splice (Pavel) - Linked work inheritance fixes and optimizations (Pavel) - Async work fixes and cleanups (Pavel) - Improve io-wq locking (Pavel) - Hashed link write improvements (Pavel) - SETUP_IOPOLL|SETUP_SQPOLL improvements (Xiaoguang)" * tag 'for-5.7/io_uring-2020-03-29' of git://git.kernel.dk/linux-block: (54 commits) io_uring: cleanup io_alloc_async_ctx() io_uring: fix missing 'return' in comment io-wq: handle hashed writes in chains io-uring: drop 'free_pfile' in struct io_file_put io-uring: drop completion when removing file io_uring: Fix ->data corruption on re-enqueue io-wq: close cancel gap for hashed linked work io_uring: make spdxcheck.py happy io_uring: honor original task RLIMIT_FSIZE io-wq: hash dependent work io-wq: split hashing and enqueueing io-wq: don't resched if there is no work io-wq: remove duplicated cancel code io_uring: fix truncated async read/readv and write/writev retry io_uring: dual license io_uring.h uapi header io_uring: io_uring_enter(2) don't poll while SETUP_IOPOLL|SETUP_SQPOLL enabled io_uring: Fix unused function warnings io_uring: add end-of-bits marker and build time verify it io_uring: provide means of removing buffers io_uring: add IOSQE_BUFFER_SELECT support for IORING_OP_RECVMSG ...
-rw-r--r-- fs/io-wq.c 368
-rw-r--r-- fs/io-wq.h 65
-rw-r--r-- fs/io_uring.c 2015
-rw-r--r-- fs/splice.c 6
-rw-r--r-- include/linux/socket.h 4
-rw-r--r-- include/linux/splice.h 3
-rw-r--r-- include/net/compat.h 3
-rw-r--r-- include/trace/events/io_uring.h 103
-rw-r--r-- include/uapi/linux/io_uring.h 42
-rw-r--r-- kernel/task_work.c 18
-rw-r--r-- net/compat.c 30
-rw-r--r-- net/socket.c 25
12 files changed, 1826 insertions, 856 deletions
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 5cef075c0b37..cc5cf2209fb0 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -69,6 +69,8 @@ struct io_worker {
#define IO_WQ_HASH_ORDER 5
#endif
+#define IO_WQ_NR_HASH_BUCKETS (1u << IO_WQ_HASH_ORDER)
+
struct io_wqe_acct {
unsigned nr_workers;
unsigned max_workers;
@@ -98,6 +100,7 @@ struct io_wqe {
struct list_head all_list;
struct io_wq *wq;
+ struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS];
};
/*
@@ -107,8 +110,7 @@ struct io_wq {
struct io_wqe **wqes;
unsigned long state;
- get_work_fn *get_work;
- put_work_fn *put_work;
+ free_work_fn *free_work;
struct task_struct *manager;
struct user_struct *user;
@@ -376,26 +378,35 @@ static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker)
return __io_worker_unuse(wqe, worker);
}
-static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash)
+static inline unsigned int io_get_work_hash(struct io_wq_work *work)
+{
+ return work->flags >> IO_WQ_HASH_SHIFT;
+}
+
+static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
__must_hold(wqe->lock)
{
struct io_wq_work_node *node, *prev;
- struct io_wq_work *work;
+ struct io_wq_work *work, *tail;
+ unsigned int hash;
wq_list_for_each(node, prev, &wqe->work_list) {
work = container_of(node, struct io_wq_work, list);
/* not hashed, can run anytime */
- if (!(work->flags & IO_WQ_WORK_HASHED)) {
- wq_node_del(&wqe->work_list, node, prev);
+ if (!io_wq_is_hashed(work)) {
+ wq_list_del(&wqe->work_list, node, prev);
return work;
}
/* hashed, can run if not already running */
- *hash = work->flags >> IO_WQ_HASH_SHIFT;
- if (!(wqe->hash_map & BIT_ULL(*hash))) {
- wqe->hash_map |= BIT_ULL(*hash);
- wq_node_del(&wqe->work_list, node, prev);
+ hash = io_get_work_hash(work);
+ if (!(wqe->hash_map & BIT(hash))) {
+ wqe->hash_map |= BIT(hash);
+ /* all items with this hash lie in [work, tail] */
+ tail = wqe->hash_tail[hash];
+ wqe->hash_tail[hash] = NULL;
+ wq_list_cut(&wqe->work_list, &tail->list, prev);
return work;
}
}
@@ -440,16 +451,49 @@ static void io_wq_switch_creds(struct io_worker *worker,
worker->saved_creds = old_creds;
}
+static void io_impersonate_work(struct io_worker *worker,
+ struct io_wq_work *work)
+{
+ if (work->files && current->files != work->files) {
+ task_lock(current);
+ current->files = work->files;
+ task_unlock(current);
+ }
+ if (work->fs && current->fs != work->fs)
+ current->fs = work->fs;
+ if (work->mm != worker->mm)
+ io_wq_switch_mm(worker, work);
+ if (worker->cur_creds != work->creds)
+ io_wq_switch_creds(worker, work);
+}
+
+static void io_assign_current_work(struct io_worker *worker,
+ struct io_wq_work *work)
+{
+ if (work) {
+ /* flush pending signals before assigning new work */
+ if (signal_pending(current))
+ flush_signals(current);
+ cond_resched();
+ }
+
+ spin_lock_irq(&worker->lock);
+ worker->cur_work = work;
+ spin_unlock_irq(&worker->lock);
+}
+
+static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work);
+
static void io_worker_handle_work(struct io_worker *worker)
__releases(wqe->lock)
{
- struct io_wq_work *work, *old_work = NULL, *put_work = NULL;
struct io_wqe *wqe = worker->wqe;
struct io_wq *wq = wqe->wq;
do {
- unsigned hash = -1U;
-
+ struct io_wq_work *work;
+ unsigned int hash;
+get_next:
/*
* If we got some work, mark us as busy. If we didn't, but
* the list isn't empty, it means we stalled on hashed work.
@@ -457,81 +501,60 @@ static void io_worker_handle_work(struct io_worker *worker)
* can't make progress, any work completion or insertion will
* clear the stalled flag.
*/
- work = io_get_next_work(wqe, &hash);
+ work = io_get_next_work(wqe);
if (work)
__io_worker_busy(wqe, worker, work);
else if (!wq_list_empty(&wqe->work_list))
wqe->flags |= IO_WQE_FLAG_STALLED;
spin_unlock_irq(&wqe->lock);
- if (put_work && wq->put_work)
- wq->put_work(old_work);
if (!work)
break;
-next:
- /* flush any pending signals before assigning new work */
- if (signal_pending(current))
- flush_signals(current);
-
- cond_resched();
-
- spin_lock_irq(&worker->lock);
- worker->cur_work = work;
- spin_unlock_irq(&worker->lock);
-
- if (work->flags & IO_WQ_WORK_CB)
- work->func(&work);
-
- if (work->files && current->files != work->files) {
- task_lock(current);
- current->files = work->files;
- task_unlock(current);
- }
- if (work->fs && current->fs != work->fs)
- current->fs = work->fs;
- if (work->mm != worker->mm)
- io_wq_switch_mm(worker, work);
- if (worker->cur_creds != work->creds)
- io_wq_switch_creds(worker, work);
- /*
- * OK to set IO_WQ_WORK_CANCEL even for uncancellable work,
- * the worker function will do the right thing.
- */
- if (test_bit(IO_WQ_BIT_CANCEL, &wq->state))
- work->flags |= IO_WQ_WORK_CANCEL;
- if (worker->mm)
- work->flags |= IO_WQ_WORK_HAS_MM;
-
- if (wq->get_work) {
- put_work = work;
- wq->get_work(work);
- }
-
- old_work = work;
- work->func(&work);
-
- spin_lock_irq(&worker->lock);
- worker->cur_work = NULL;
- spin_unlock_irq(&worker->lock);
-
- spin_lock_irq(&wqe->lock);
-
- if (hash != -1U) {
- wqe->hash_map &= ~BIT_ULL(hash);
- wqe->flags &= ~IO_WQE_FLAG_STALLED;
- }
- if (work && work != old_work) {
- spin_unlock_irq(&wqe->lock);
-
- if (put_work && wq->put_work) {
- wq->put_work(put_work);
- put_work = NULL;
+ io_assign_current_work(worker, work);
+
+ /* handle a whole dependent link */
+ do {
+ struct io_wq_work *old_work, *next_hashed, *linked;
+
+ next_hashed = wq_next_work(work);
+ io_impersonate_work(worker, work);
+ /*
+ * OK to set IO_WQ_WORK_CANCEL even for uncancellable
+ * work, the worker function will do the right thing.
+ */
+ if (test_bit(IO_WQ_BIT_CANCEL, &wq->state))
+ work->flags |= IO_WQ_WORK_CANCEL;
+
+ hash = io_get_work_hash(work);
+ linked = old_work = work;
+ linked->func(&linked);
+ linked = (old_work == linked) ? NULL : linked;
+
+ work = next_hashed;
+ if (!work && linked && !io_wq_is_hashed(linked)) {
+ work = linked;
+ linked = NULL;
}
+ io_assign_current_work(worker, work);
+ wq->free_work(old_work);
+
+ if (linked)
+ io_wqe_enqueue(wqe, linked);
+
+ if (hash != -1U && !next_hashed) {
+ spin_lock_irq(&wqe->lock);
+ wqe->hash_map &= ~BIT_ULL(hash);
+ wqe->flags &= ~IO_WQE_FLAG_STALLED;
+ /* dependent work is not hashed */
+ hash = -1U;
+ /* skip unnecessary unlock-lock wqe->lock */
+ if (!work)
+ goto get_next;
+ spin_unlock_irq(&wqe->lock);
+ }
+ } while (work);
- /* dependent work not hashed */
- hash = -1U;
- goto next;
- }
+ spin_lock_irq(&wqe->lock);
} while (1);
}
@@ -747,17 +770,40 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
return true;
}
-static void io_run_cancel(struct io_wq_work *work)
+static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
{
+ struct io_wq *wq = wqe->wq;
+
do {
struct io_wq_work *old_work = work;
work->flags |= IO_WQ_WORK_CANCEL;
work->func(&work);
work = (work == old_work) ? NULL : work;
+ wq->free_work(old_work);
} while (work);
}
+static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work)
+{
+ unsigned int hash;
+ struct io_wq_work *tail;
+
+ if (!io_wq_is_hashed(work)) {
+append:
+ wq_list_add_tail(&work->list, &wqe->work_list);
+ return;
+ }
+
+ hash = io_get_work_hash(work);
+ tail = wqe->hash_tail[hash];
+ wqe->hash_tail[hash] = work;
+ if (!tail)
+ goto append;
+
+ wq_list_add_after(&work->list, &tail->list, &wqe->work_list);
+}
+
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
{
struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
@@ -771,13 +817,13 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
* It's close enough to not be an issue, fork() has the same delay.
*/
if (unlikely(!io_wq_can_queue(wqe, acct, work))) {
- io_run_cancel(work);
+ io_run_cancel(work, wqe);
return;
}
work_flags = work->flags;
spin_lock_irqsave(&wqe->lock, flags);
- wq_list_add_tail(&work->list, &wqe->work_list);
+ io_wqe_insert_work(wqe, work);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
spin_unlock_irqrestore(&wqe->lock, flags);
@@ -794,19 +840,15 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
}
/*
- * Enqueue work, hashed by some key. Work items that hash to the same value
- * will not be done in parallel. Used to limit concurrent writes, generally
- * hashed by inode.
+ * Work items that hash to the same value will not be done in parallel.
+ * Used to limit concurrent writes, generally hashed by inode.
*/
-void io_wq_enqueue_hashed(struct io_wq *wq, struct io_wq_work *work, void *val)
+void io_wq_hash_work(struct io_wq_work *work, void *val)
{
- struct io_wqe *wqe = wq->wqes[numa_node_id()];
- unsigned bit;
-
+ unsigned int bit;
bit = hash_ptr(val, IO_WQ_HASH_ORDER);
work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT));
- io_wqe_enqueue(wqe, work);
}
static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data)
@@ -856,14 +898,13 @@ void io_wq_cancel_all(struct io_wq *wq)
}
struct io_cb_cancel_data {
- struct io_wqe *wqe;
- work_cancel_fn *cancel;
- void *caller_data;
+ work_cancel_fn *fn;
+ void *data;
};
-static bool io_work_cancel(struct io_worker *worker, void *cancel_data)
+static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
{
- struct io_cb_cancel_data *data = cancel_data;
+ struct io_cb_cancel_data *match = data;
unsigned long flags;
bool ret = false;
@@ -874,83 +915,7 @@ static bool io_work_cancel(struct io_worker *worker, void *cancel_data)
spin_lock_irqsave(&worker->lock, flags);
if (worker->cur_work &&
!(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL) &&
- data->cancel(worker->cur_work, data->caller_data)) {
- send_sig(SIGINT, worker->task, 1);
- ret = true;
- }
- spin_unlock_irqrestore(&worker->lock, flags);
-
- return ret;
-}
-
-static enum io_wq_cancel io_wqe_cancel_cb_work(struct io_wqe *wqe,
- work_cancel_fn *cancel,
- void *cancel_data)
-{
- struct io_cb_cancel_data data = {
- .wqe = wqe,
- .cancel = cancel,
- .caller_data = cancel_data,
- };
- struct io_wq_work_node *node, *prev;
- struct io_wq_work *work;
- unsigned long flags;
- bool found = false;
-
- spin_lock_irqsave(&wqe->lock, flags);
- wq_list_for_each(node, prev, &wqe->work_list) {
- work = container_of(node, struct io_wq_work, list);
-
- if (cancel(work, cancel_data)) {
- wq_node_del(&wqe->work_list, node, prev);
- found = true;
- break;
- }
- }
- spin_unlock_irqrestore(&wqe->lock, flags);
-
- if (found) {
- io_run_cancel(work);
- return IO_WQ_CANCEL_OK;
- }
-
- rcu_read_lock();
- found = io_wq_for_each_worker(wqe, io_work_cancel, &data);
- rcu_read_unlock();
- return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND;
-}
-
-enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
- void *data)
-{
- enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND;
- int node;
-
- for_each_node(node) {
- struct io_wqe *wqe = wq->wqes[node];
-
- ret = io_wqe_cancel_cb_work(wqe, cancel, data);
- if (ret != IO_WQ_CANCEL_NOTFOUND)
- break;
- }
-
- return ret;
-}
-
-struct work_match {
- bool (*fn)(struct io_wq_work *, void *data);
- void *data;
-};
-
-static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
-{
- struct work_match *match = data;
- unsigned long flags;
- bool ret = false;
-
- spin_lock_irqsave(&worker->lock, flags);
- if (match->fn(worker->cur_work, match->data) &&
- !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL)) {
+ match->fn(worker->cur_work, match->data)) {
send_sig(SIGINT, worker->task, 1);
ret = true;
}
@@ -960,7 +925,7 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
}
static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
- struct work_match *match)
+ struct io_cb_cancel_data *match)
{
struct io_wq_work_node *node, *prev;
struct io_wq_work *work;
@@ -977,7 +942,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
work = container_of(node, struct io_wq_work, list);
if (match->fn(work, match->data)) {
- wq_node_del(&wqe->work_list, node, prev);
+ wq_list_del(&wqe->work_list, node, prev);
found = true;
break;
}
@@ -985,7 +950,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
spin_unlock_irqrestore(&wqe->lock, flags);
if (found) {
- io_run_cancel(work);
+ io_run_cancel(work, wqe);
return IO_WQ_CANCEL_OK;
}
@@ -1001,22 +966,16 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND;
}
-static bool io_wq_work_match(struct io_wq_work *work, void *data)
-{
- return work == data;
-}
-
-enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
+enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
+ void *data)
{
- struct work_match match = {
- .fn = io_wq_work_match,
- .data = cwork
+ struct io_cb_cancel_data match = {
+ .fn = cancel,
+ .data = data,
};
enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND;
int node;
- cwork->flags |= IO_WQ_WORK_CANCEL;
-
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
@@ -1028,33 +987,28 @@ enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
return ret;
}
+static bool io_wq_io_cb_cancel_data(struct io_wq_work *work, void *data)
+{
+ return work == data;
+}
+
+enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
+{
+ return io_wq_cancel_cb(wq, io_wq_io_cb_cancel_data, (void *)cwork);
+}
+
static bool io_wq_pid_match(struct io_wq_work *work, void *data)
{
pid_t pid = (pid_t) (unsigned long) data;
- if (work)
- return work->task_pid == pid;
- return false;
+ return work->task_pid == pid;
}
enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid)
{
- struct work_match match = {
- .fn = io_wq_pid_match,
- .data = (void *) (unsigned long) pid
- };
- enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND;
- int node;
-
- for_each_node(node) {
- struct io_wqe *wqe = wq->wqes[node];
+ void *data = (void *) (unsigned long) pid;
- ret = io_wqe_cancel_work(wqe, &match);
- if (ret != IO_WQ_CANCEL_NOTFOUND)
- break;
- }
-
- return ret;
+ return io_wq_cancel_cb(wq, io_wq_pid_match, data);
}
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
@@ -1062,6 +1016,9 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
int ret = -ENOMEM, node;
struct io_wq *wq;
+ if (WARN_ON_ONCE(!data->free_work))
+ return ERR_PTR(-EINVAL);
+
wq = kzalloc(sizeof(*wq), GFP_KERNEL);
if (!wq)
return ERR_PTR(-ENOMEM);
@@ -1072,8 +1029,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
return ERR_PTR(-ENOMEM);
}
- wq->get_work = data->get_work;
- wq->put_work = data->put_work;
+ wq->free_work = data->free_work;
/* caller must already hold a reference to this */
wq->user = data->user;
@@ -1130,7 +1086,7 @@ err:
bool io_wq_get(struct io_wq *wq, struct io_wq_data *data)
{
- if (data->get_work != wq->get_work || data->put_work != wq->put_work)
+ if (data->free_work != wq->free_work)
return false;
return refcount_inc_not_zero(&wq->use_refs);
diff --git a/fs/io-wq.h b/fs/io-wq.h
index e5e15f2c93ec..3ee7356d6be5 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -5,10 +5,8 @@ struct io_wq;
enum {
IO_WQ_WORK_CANCEL = 1,
- IO_WQ_WORK_HAS_MM = 2,
IO_WQ_WORK_HASHED = 4,
IO_WQ_WORK_UNBOUND = 32,
- IO_WQ_WORK_CB = 128,
IO_WQ_WORK_NO_CANCEL = 256,
IO_WQ_WORK_CONCURRENT = 512,
@@ -30,6 +28,18 @@ struct io_wq_work_list {
struct io_wq_work_node *last;
};
+static inline void wq_list_add_after(struct io_wq_work_node *node,
+ struct io_wq_work_node *pos,
+ struct io_wq_work_list *list)
+{
+ struct io_wq_work_node *next = pos->next;
+
+ pos->next = node;
+ node->next = next;
+ if (!next)
+ list->last = node;
+}
+
static inline void wq_list_add_tail(struct io_wq_work_node *node,
struct io_wq_work_list *list)
{
@@ -42,17 +52,26 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node,
}
}
-static inline void wq_node_del(struct io_wq_work_list *list,
- struct io_wq_work_node *node,
+static inline void wq_list_cut(struct io_wq_work_list *list,
+ struct io_wq_work_node *last,
struct io_wq_work_node *prev)
{
- if (node == list->first)
- WRITE_ONCE(list->first, node->next);
- if (node == list->last)
+ /* first in the list, if prev==NULL */
+ if (!prev)
+ WRITE_ONCE(list->first, last->next);
+ else
+ prev->next = last->next;
+
+ if (last == list->last)
list->last = prev;
- if (prev)
- prev->next = node->next;
- node->next = NULL;
+ last->next = NULL;
+}
+
+static inline void wq_list_del(struct io_wq_work_list *list,
+ struct io_wq_work_node *node,
+ struct io_wq_work_node *prev)
+{
+ wq_list_cut(list, node, prev);
}
#define wq_list_for_each(pos, prv, head) \
@@ -65,10 +84,7 @@ static inline void wq_node_del(struct io_wq_work_list *list,
} while (0)
struct io_wq_work {
- union {
- struct io_wq_work_node list;
- void *data;
- };
+ struct io_wq_work_node list;
void (*func)(struct io_wq_work **);
struct files_struct *files;
struct mm_struct *mm;
@@ -83,14 +99,20 @@ struct io_wq_work {
*(work) = (struct io_wq_work){ .func = _func }; \
} while (0) \
-typedef void (get_work_fn)(struct io_wq_work *);
-typedef void (put_work_fn)(struct io_wq_work *);
+static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
+{
+ if (!work->list.next)
+ return NULL;
+
+ return container_of(work->list.next, struct io_wq_work, list);
+}
+
+typedef void (free_work_fn)(struct io_wq_work *);
struct io_wq_data {
struct user_struct *user;
- get_work_fn *get_work;
- put_work_fn *put_work;
+ free_work_fn *free_work;
};
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
@@ -98,7 +120,12 @@ bool io_wq_get(struct io_wq *wq, struct io_wq_data *data);
void io_wq_destroy(struct io_wq *wq);
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
-void io_wq_enqueue_hashed(struct io_wq *wq, struct io_wq_work *work, void *val);
+void io_wq_hash_work(struct io_wq_work *work, void *val);
+
+static inline bool io_wq_is_hashed(struct io_wq_work *work)
+{
+ return work->flags & IO_WQ_WORK_HASHED;
+}
void io_wq_cancel_all(struct io_wq *wq);
enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork);
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 3affd96a98ba..358f97be9c7b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -44,6 +44,7 @@
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/compat.h>
+#include <net/compat.h>
#include <linux/refcount.h>
#include <linux/uio.h>
#include <linux/bits.h>
@@ -76,6 +77,8 @@
#include <linux/fadvise.h>
#include <linux/eventpoll.h>
#include <linux/fs_struct.h>
+#include <linux/splice.h>
+#include <linux/task_work.h>
#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
@@ -193,6 +196,13 @@ struct fixed_file_data {
struct completion done;
};
+struct io_buffer {
+ struct list_head list;
+ __u64 addr;
+ __s32 len;
+ __u16 bid;
+};
+
struct io_ring_ctx {
struct {
struct percpu_ref refs;
@@ -270,6 +280,8 @@ struct io_ring_ctx {
struct socket *ring_sock;
#endif
+ struct idr io_buffer_idr;
+
struct idr personality_idr;
struct {
@@ -290,7 +302,6 @@ struct io_ring_ctx {
struct {
spinlock_t completion_lock;
- struct llist_head poll_llist;
/*
* ->poll_list is protected by the ctx->uring_lock for
@@ -386,7 +397,9 @@ struct io_sr_msg {
void __user *buf;
};
int msg_flags;
+ int bgid;
size_t len;
+ struct io_buffer *kbuf;
};
struct io_open {
@@ -430,6 +443,24 @@ struct io_epoll {
struct epoll_event event;
};
+struct io_splice {
+ struct file *file_out;
+ struct file *file_in;
+ loff_t off_out;
+ loff_t off_in;
+ u64 len;
+ unsigned int flags;
+};
+
+struct io_provide_buf {
+ struct file *file;
+ __u64 addr;
+ __s32 len;
+ __u32 bgid;
+ __u16 nbufs;
+ __u16 bid;
+};
+
struct io_async_connect {
struct sockaddr_storage address;
};
@@ -464,6 +495,7 @@ enum {
REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT,
REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,
REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
+ REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
REQ_F_LINK_NEXT_BIT,
REQ_F_FAIL_LINK_BIT,
@@ -479,6 +511,11 @@ enum {
REQ_F_COMP_LOCKED_BIT,
REQ_F_NEED_CLEANUP_BIT,
REQ_F_OVERFLOW_BIT,
+ REQ_F_POLLED_BIT,
+ REQ_F_BUFFER_SELECTED_BIT,
+
+ /* not a real bit, just to check we're not overflowing the space */
+ __REQ_F_LAST_BIT,
};
enum {
@@ -492,6 +529,8 @@ enum {
REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT),
/* IOSQE_ASYNC */
REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT),
+ /* IOSQE_BUFFER_SELECT */
+ REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),
/* already grabbed next link */
REQ_F_LINK_NEXT = BIT(REQ_F_LINK_NEXT_BIT),
@@ -521,6 +560,15 @@ enum {
REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT),
/* in overflow list */
REQ_F_OVERFLOW = BIT(REQ_F_OVERFLOW_BIT),
+ /* already went through poll handler */
+ REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),
+ /* buffer already selected */
+ REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
+};
+
+struct async_poll {
+ struct io_poll_iocb poll;
+ struct io_wq_work work;
};
/*
@@ -546,32 +594,45 @@ struct io_kiocb {
struct io_fadvise fadvise;
struct io_madvise madvise;
struct io_epoll epoll;
+ struct io_splice splice;
+ struct io_provide_buf pbuf;
};
struct io_async_ctx *io;
- /*
- * llist_node is only used for poll deferred completions
- */
- struct llist_node llist_node;
- bool in_async;
bool needs_fixed_file;
u8 opcode;
struct io_ring_ctx *ctx;
- union {
- struct list_head list;
- struct hlist_node hash_node;
- };
- struct list_head link_list;
+ struct list_head list;
unsigned int flags;
refcount_t refs;
+ union {
+ struct task_struct *task;
+ unsigned long fsize;
+ };
u64 user_data;
u32 result;
u32 sequence;
+ struct list_head link_list;
+
struct list_head inflight_entry;
- struct io_wq_work work;
+ union {
+ /*
+ * Only commands that never go async can use the below fields,
+ * obviously. Right now only IORING_OP_POLL_ADD uses them, and
+ * async armed poll handlers for regular commands. The latter
+ * restore the work, if needed.
+ */
+ struct {
+ struct callback_head task_work;
+ struct hlist_node hash_node;
+ struct async_poll *apoll;
+ int cflags;
+ };
+ struct io_wq_work work;
+ };
};
#define IO_PLUG_THRESHOLD 2
@@ -615,6 +676,11 @@ struct io_op_def {
unsigned file_table : 1;
/* needs ->fs */
unsigned needs_fs : 1;
+ /* set if opcode supports polled "wait" */
+ unsigned pollin : 1;
+ unsigned pollout : 1;
+ /* op supports buffer selection */
+ unsigned buffer_select : 1;
};
static const struct io_op_def io_op_defs[] = {
@@ -624,6 +690,8 @@ static const struct io_op_def io_op_defs[] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
+ .pollin = 1,
+ .buffer_select = 1,
},
[IORING_OP_WRITEV] = {
.async_ctx = 1,
@@ -631,6 +699,7 @@ static const struct io_op_def io_op_defs[] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
+ .pollout = 1,
},
[IORING_OP_FSYNC] = {
.needs_file = 1,
@@ -638,11 +707,13 @@ static const struct io_op_def io_op_defs[] = {
[IORING_OP_READ_FIXED] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
+ .pollin = 1,
},
[IORING_OP_WRITE_FIXED] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
+ .pollout = 1,
},
[IORING_OP_POLL_ADD] = {
.needs_file = 1,
@@ -658,6 +729,7 @@ static const struct io_op_def io_op_defs[] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.needs_fs = 1,
+ .pollout = 1,
},
[IORING_OP_RECVMSG] = {
.async_ctx = 1,
@@ -665,6 +737,8 @@ static const struct io_op_def io_op_defs[] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.needs_fs = 1,
+ .pollin = 1,
+ .buffer_select = 1,
},
[IORING_OP_TIMEOUT] = {
.async_ctx = 1,
@@ -676,6 +750,7 @@ static const struct io_op_def io_op_defs[] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.file_table = 1,
+ .pollin = 1,
},
[IORING_OP_ASYNC_CANCEL] = {},
[IORING_OP_LINK_TIMEOUT] = {
@@ -687,6 +762,7 @@ static const struct io_op_def io_op_defs[] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
+ .pollout = 1,
},
[IORING_OP_FALLOCATE] = {
.needs_file = 1,
@@ -715,11 +791,14 @@ static const struct io_op_def io_op_defs[] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
+ .pollin = 1,
+ .buffer_select = 1,
},
[IORING_OP_WRITE] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
+ .pollout = 1,
},
[IORING_OP_FADVISE] = {
.needs_file = 1,
@@ -731,11 +810,14 @@ static const struct io_op_def io_op_defs[] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
+ .pollout = 1,
},
[IORING_OP_RECV] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
+ .pollin = 1,
+ .buffer_select = 1,
},
[IORING_OP_OPENAT2] = {
.needs_file = 1,
@@ -747,6 +829,13 @@ static const struct io_op_def io_op_defs[] = {
.unbound_nonreg_file = 1,
.file_table = 1,
},
+ [IORING_OP_SPLICE] = {
+ .needs_file = 1,
+ .hash_reg_file = 1,
+ .unbound_nonreg_file = 1,
+ },
+ [IORING_OP_PROVIDE_BUFFERS] = {},
+ [IORING_OP_REMOVE_BUFFERS] = {},
};
static void io_wq_submit_work(struct io_wq_work **workptr);
@@ -761,6 +850,10 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
static int io_grab_files(struct io_kiocb *req);
static void io_ring_file_ref_flush(struct fixed_file_data *data);
static void io_cleanup_req(struct io_kiocb *req);
+static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
+ int fd, struct file **out_file, bool fixed);
+static void __io_queue_sqe(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe);
static struct kmem_cache *req_cachep;
@@ -827,11 +920,11 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->cq_overflow_list);
init_completion(&ctx->completions[0]);
init_completion(&ctx->completions[1]);
+ idr_init(&ctx->io_buffer_idr);
idr_init(&ctx->personality_idr);
mutex_init(&ctx->uring_lock);
init_waitqueue_head(&ctx->wait);
spin_lock_init(&ctx->completion_lock);
- init_llist_head(&ctx->poll_llist);
INIT_LIST_HEAD(&ctx->poll_list);
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
@@ -952,15 +1045,14 @@ static inline void io_req_work_drop_env(struct io_kiocb *req)
}
}
-static inline bool io_prep_async_work(struct io_kiocb *req,
+static inline void io_prep_async_work(struct io_kiocb *req,
struct io_kiocb **link)
{
const struct io_op_def *def = &io_op_defs[req->opcode];
- bool do_hashed = false;
if (req->flags & REQ_F_ISREG) {
if (def->hash_reg_file)
- do_hashed = true;
+ io_wq_hash_work(&req->work, file_inode(req->file));
} else {
if (def->unbound_nonreg_file)
req->work.flags |= IO_WQ_WORK_UNBOUND;
@@ -969,25 +1061,18 @@ static inline bool io_prep_async_work(struct io_kiocb *req,
io_req_work_grab_env(req, def);
*link = io_prep_linked_timeout(req);
- return do_hashed;
}
static inline void io_queue_async_work(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *link;
- bool do_hashed;
- do_hashed = io_prep_async_work(req, &link);
+ io_prep_async_work(req, &link);
- trace_io_uring_queue_async_work(ctx, do_hashed, req, &req->work,
- req->flags);