From 2b188cc1bb857a9d4701ae59aa7768b5124e262e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 7 Jan 2019 10:46:33 -0700 Subject: Add io_uring IO interface The submission queue (SQ) and completion queue (CQ) rings are shared between the application and the kernel. This eliminates the need to copy data back and forth to submit and complete IO. IO submissions use the io_uring_sqe data structure, and completions are generated in the form of io_uring_cqe data structures. The SQ ring is an index into the io_uring_sqe array, which makes it possible to submit a batch of IOs without them being contiguous in the ring. The CQ ring is always contiguous, as completion events are inherently unordered, and hence any io_uring_cqe entry can point back to an arbitrary submission. Two new system calls are added for this: io_uring_setup(entries, params) Sets up an io_uring instance for doing async IO. On success, returns a file descriptor that the application can mmap to gain access to the SQ ring, CQ ring, and io_uring_sqes. io_uring_enter(fd, to_submit, min_complete, flags, sigset, sigsetsize) Initiates IO against the rings mapped to this fd, or waits for them to complete, or both. The behavior is controlled by the parameters passed in. If 'to_submit' is non-zero, then we'll try and submit new IO. If IORING_ENTER_GETEVENTS is set, the kernel will wait for 'min_complete' events, if they aren't already available. It's valid to set IORING_ENTER_GETEVENTS and 'min_complete' == 0 at the same time, this allows the kernel to return already completed events without waiting for them. This is useful only for polling, as for IRQ driven IO, the application can just check the CQ ring without entering the kernel. With this setup, it's possible to do async IO with a single system call. Future developments will enable polled IO with this interface, and polled submission as well. The latter will enable an application to do IO without doing ANY system calls at all. For IRQ driven IO, an application only needs to enter the kernel for completions if it wants to wait for them to occur. Each io_uring is backed by a workqueue, to support buffered async IO as well. We will only punt to an async context if the command would need to wait for IO on the device side. Any data that can be accessed directly in the page cache is done inline. This avoids the slowness issue of usual threadpools, since cached data is accessed as quickly as a sync interface. Sample application: http://git.kernel.dk/cgit/fio/plain/t/io_uring.c Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/fs.h | 9 ++++ include/linux/sched/user.h | 2 +- include/linux/syscalls.h | 6 +++ include/uapi/asm-generic/unistd.h | 6 ++- include/uapi/linux/io_uring.h | 95 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 116 insertions(+), 2 deletions(-) create mode 100644 include/uapi/linux/io_uring.h (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index dedcc2e9265c..61aa210f0c2b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3517,4 +3517,13 @@ extern void inode_nohighmem(struct inode *inode); extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice); +#if defined(CONFIG_IO_URING) +extern struct sock *io_uring_get_socket(struct file *file); +#else +static inline struct sock *io_uring_get_socket(struct file *file) +{ + return NULL; +} +#endif + #endif /* _LINUX_FS_H */ diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h index 39ad98c09c58..c7b5f86b91a1 100644 --- a/include/linux/sched/user.h +++ b/include/linux/sched/user.h @@ -40,7 +40,7 @@ struct user_struct { kuid_t uid; #if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL) || \ - defined(CONFIG_NET) + defined(CONFIG_NET) || defined(CONFIG_IO_URING) atomic_long_t locked_vm; #endif diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 257cccba3062..3072dbaa7869 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -69,6 +69,7 @@ struct file_handle; struct sigaltstack; struct rseq; union bpf_attr; +struct io_uring_params; #include #include @@ -309,6 +310,11 @@ asmlinkage long sys_io_pgetevents_time32(aio_context_t ctx_id, struct io_event __user *events, struct old_timespec32 __user *timeout, const struct __aio_sigset *sig); +asmlinkage long sys_io_uring_setup(u32 entries, + struct io_uring_params __user *p); +asmlinkage long sys_io_uring_enter(unsigned int fd, u32 to_submit, + u32 min_complete, u32 flags, + const sigset_t __user *sig, size_t sigsz); /* fs/xattr.c */ asmlinkage long sys_setxattr(const char __user *path, const char __user *name, diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index d90127298f12..87871e7b7ea7 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -740,9 +740,13 @@ __SC_COMP(__NR_io_pgetevents, sys_io_pgetevents, compat_sys_io_pgetevents) __SYSCALL(__NR_rseq, sys_rseq) #define __NR_kexec_file_load 294 __SYSCALL(__NR_kexec_file_load, sys_kexec_file_load) +#define __NR_io_uring_setup 425 +__SYSCALL(__NR_io_uring_setup, sys_io_uring_setup) +#define __NR_io_uring_enter 426 +__SYSCALL(__NR_io_uring_enter, sys_io_uring_enter) #undef __NR_syscalls -#define __NR_syscalls 295 +#define __NR_syscalls 427 /* * 32 bit systems traditionally used different diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h new file mode 100644 index 000000000000..ac692823d6f4 --- /dev/null +++ b/include/uapi/linux/io_uring.h @@ -0,0 +1,95 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Header file for the io_uring interface. + * + * Copyright (C) 2019 Jens Axboe + * Copyright (C) 2019 Christoph Hellwig + */ +#ifndef LINUX_IO_URING_H +#define LINUX_IO_URING_H + +#include +#include + +/* + * IO submission data structure (Submission Queue Entry) + */ +struct io_uring_sqe { + __u8 opcode; /* type of operation for this sqe */ + __u8 flags; /* as of now unused */ + __u16 ioprio; /* ioprio for the request */ + __s32 fd; /* file descriptor to do IO on */ + __u64 off; /* offset into file */ + __u64 addr; /* pointer to buffer or iovecs */ + __u32 len; /* buffer size or number of iovecs */ + union { + __kernel_rwf_t rw_flags; + __u32 __resv; + }; + __u64 user_data; /* data to be passed back at completion time */ + __u64 __pad2[3]; +}; + +#define IORING_OP_NOP 0 +#define IORING_OP_READV 1 +#define IORING_OP_WRITEV 2 + +/* + * IO completion data structure (Completion Queue Entry) + */ +struct io_uring_cqe { + __u64 user_data; /* sqe->data submission passed back */ + __s32 res; /* result code for this event */ + __u32 flags; +}; + +/* + * Magic offsets for the application to mmap the data it needs + */ +#define IORING_OFF_SQ_RING 0ULL +#define IORING_OFF_CQ_RING 0x8000000ULL +#define IORING_OFF_SQES 0x10000000ULL + +/* + * Filled with the offset for mmap(2) + */ +struct io_sqring_offsets { + __u32 head; + __u32 tail; + __u32 ring_mask; + __u32 ring_entries; + __u32 flags; + __u32 dropped; + __u32 array; + __u32 resv1; + __u64 resv2; +}; + +struct io_cqring_offsets { + __u32 head; + __u32 tail; + __u32 ring_mask; + __u32 ring_entries; + __u32 overflow; + __u32 cqes; + __u64 resv[2]; +}; + +/* + * io_uring_enter(2) flags + */ +#define IORING_ENTER_GETEVENTS (1U << 0) + +/* + * Passed in for io_uring_setup(2). Copied back with updated info on success + */ +struct io_uring_params { + __u32 sq_entries; + __u32 cq_entries; + __u32 flags; + __u32 resv[7]; + struct io_sqring_offsets sq_off; + struct io_cqring_offsets cq_off; +}; + +#endif -- cgit v1.2.3 From c992fe2925d776be066d9f6cc13f9ea11d78b657 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 11 Jan 2019 09:43:02 -0700 Subject: io_uring: add fsync support Add a new fsync opcode, which either syncs a range if one is passed, or the whole file if the offset and length fields are both cleared to zero. A flag is provided to use fdatasync semantics, that is only force out metadata which is required to retrieve the file data, but not others like metadata. Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index ac692823d6f4..4589d56d0b68 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -24,7 +24,7 @@ struct io_uring_sqe { __u32 len; /* buffer size or number of iovecs */ union { __kernel_rwf_t rw_flags; - __u32 __resv; + __u32 fsync_flags; }; __u64 user_data; /* data to be passed back at completion time */ __u64 __pad2[3]; @@ -33,6 +33,12 @@ struct io_uring_sqe { #define IORING_OP_NOP 0 #define IORING_OP_READV 1 #define IORING_OP_WRITEV 2 +#define IORING_OP_FSYNC 3 + +/* + * sqe->fsync_flags + */ +#define IORING_FSYNC_DATASYNC (1U << 0) /* * IO completion data structure (Completion Queue Entry) -- cgit v1.2.3 From def596e9557c91d9846fc4d84d26f2c564644416 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 9 Jan 2019 08:59:42 -0700 Subject: io_uring: support for IO polling Add support for a polled io_uring instance. When a read or write is submitted to a polled io_uring, the application must poll for completions on the CQ ring through io_uring_enter(2). Polled IO may not generate IRQ completions, hence they need to be actively found by the application itself. To use polling, io_uring_setup() must be used with the IORING_SETUP_IOPOLL flag being set. It is illegal to mix and match polled and non-polled IO on an io_uring. Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 4589d56d0b68..5c457ea396e6 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -30,6 +30,11 @@ struct io_uring_sqe { __u64 __pad2[3]; }; +/* + * io_uring_setup() flags + */ +#define IORING_SETUP_IOPOLL (1U << 0) /* io_context is polled */ + #define IORING_OP_NOP 0 #define IORING_OP_READV 1 #define IORING_OP_WRITEV 2 -- cgit v1.2.3 From 091141a42e15fe47ada737f3996b317072afcefb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 21 Nov 2018 10:32:39 -0700 Subject: fs: add fget_many() and fput_many() Some uses cases repeatedly get and put references to the same file, but the only exposed interface is doing these one at the time. As each of these entail an atomic inc or dec on a shared structure, that cost can add up. Add fget_many(), which works just like fget(), except it takes an argument for how many references to get on the file. Ditto fput_many(), which can drop an arbitrary number of references to a file. Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/file.h | 2 ++ include/linux/fs.h | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/file.h b/include/linux/file.h index 6b2fb032416c..3fcddff56bc4 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -13,6 +13,7 @@ struct file; extern void fput(struct file *); +extern void fput_many(struct file *, unsigned int); struct file_operations; struct vfsmount; @@ -44,6 +45,7 @@ static inline void fdput(struct fd fd) } extern struct file *fget(unsigned int fd); +extern struct file *fget_many(unsigned int fd, unsigned int refs); extern struct file *fget_raw(unsigned int fd); extern unsigned long __fdget(unsigned int fd); extern unsigned long __fdget_raw(unsigned int fd); diff --git a/include/linux/fs.h b/include/linux/fs.h index 61aa210f0c2b..80e1b199a4b1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -952,7 +952,9 @@ static inline struct file *get_file(struct file *f) atomic_long_inc(&f->f_count); return f; } -#define get_file_rcu(x) atomic_long_inc_not_zero(&(x)->f_count) +#define get_file_rcu_many(x, cnt) \ + atomic_long_add_unless(&(x)->f_count, (cnt), 0) +#define get_file_rcu(x) get_file_rcu_many((x), 1) #define fput_atomic(x) atomic_long_add_unless(&(x)->f_count, -1, 1) #define file_count(x) atomic_long_read(&(x)->f_count) -- cgit v1.2.3 From edafccee56ff31678a091ddb7219aba9b28bc3cb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 9 Jan 2019 09:16:05 -0700 Subject: io_uring: add support for pre-mapped user IO buffers If we have fixed user buffers, we can map them into the kernel when we setup the io_uring. That avoids the need to do get_user_pages() for each and every IO. To utilize this feature, the application must call io_uring_register() after having setup an io_uring instance, passing in IORING_REGISTER_BUFFERS as the opcode. The argument must be a pointer to an iovec array, and the nr_args should contain how many iovecs the application wishes to map. If successful, these buffers are now mapped into the kernel, eligible for IO. To use these fixed buffers, the application must use the IORING_OP_READ_FIXED and IORING_OP_WRITE_FIXED opcodes, and then set sqe->index to the desired buffer index. sqe->addr..sqe->addr+seq->len must point to somewhere inside the indexed buffer. The application may register buffers throughout the lifetime of the io_uring instance. It can call io_uring_register() with IORING_UNREGISTER_BUFFERS as the opcode to unregister the current set of buffers, and then register a new set. The application need not unregister buffers explicitly before shutting down the io_uring instance. It's perfectly valid to setup a larger buffer, and then sometimes only use parts of it for an IO. As long as the range is within the originally mapped region, it will work just fine. For now, buffers must not be file backed. If file backed buffers are passed in, the registration will fail with -1/EOPNOTSUPP. This restriction may be relaxed in the future. RLIMIT_MEMLOCK is used to check how much memory we can pin. A somewhat arbitrary 1G per buffer size is also imposed. Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/syscalls.h | 2 ++ include/uapi/asm-generic/unistd.h | 4 +++- include/uapi/linux/io_uring.h | 13 ++++++++++++- 3 files changed, 17 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 3072dbaa7869..3681c05ac538 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -315,6 +315,8 @@ asmlinkage long sys_io_uring_setup(u32 entries, asmlinkage long sys_io_uring_enter(unsigned int fd, u32 to_submit, u32 min_complete, u32 flags, const sigset_t __user *sig, size_t sigsz); +asmlinkage long sys_io_uring_register(unsigned int fd, unsigned int op, + void __user *arg, unsigned int nr_args); /* fs/xattr.c */ asmlinkage long sys_setxattr(const char __user *path, const char __user *name, diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 87871e7b7ea7..d346229a1eb0 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -744,9 +744,11 @@ __SYSCALL(__NR_kexec_file_load, sys_kexec_file_load) __SYSCALL(__NR_io_uring_setup, sys_io_uring_setup) #define __NR_io_uring_enter 426 __SYSCALL(__NR_io_uring_enter, sys_io_uring_enter) +#define __NR_io_uring_register 427 +__SYSCALL(__NR_io_uring_register, sys_io_uring_register) #undef __NR_syscalls -#define __NR_syscalls 427 +#define __NR_syscalls 428 /* * 32 bit systems traditionally used different diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 5c457ea396e6..cf28f7a11f12 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -27,7 +27,10 @@ struct io_uring_sqe { __u32 fsync_flags; }; __u64 user_data; /* data to be passed back at completion time */ - __u64 __pad2[3]; + union { + __u16 buf_index; /* index into fixed buffers, if used */ + __u64 __pad2[3]; + }; }; /* @@ -39,6 +42,8 @@ struct io_uring_sqe { #define IORING_OP_READV 1 #define IORING_OP_WRITEV 2 #define IORING_OP_FSYNC 3 +#define IORING_OP_READ_FIXED 4 +#define IORING_OP_WRITE_FIXED 5 /* * sqe->fsync_flags @@ -103,4 +108,10 @@ struct io_uring_params { struct io_cqring_offsets cq_off; }; +/* + * io_uring_register(2) opcodes and arguments + */ +#define IORING_REGISTER_BUFFERS 0 +#define IORING_UNREGISTER_BUFFERS 1 + #endif -- cgit v1.2.3 From f4e65870e5cede5ca1ec0006b6c9803994e5f7b8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 8 Feb 2019 09:01:44 -0700 Subject: net: split out functions related to registering inflight socket files We need this functionality for the io_uring file registration, but we cannot rely on it since CONFIG_UNIX can be modular. Move the helpers to a separate file, that's always builtin to the kernel if CONFIG_UNIX is m/y. No functional changes in this patch, just moving code around. Reviewed-by: Hannes Reinecke Acked-by: David S. Miller Signed-off-by: Jens Axboe --- include/net/af_unix.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index ddbba838d048..3426d6dacc45 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -10,6 +10,7 @@ void unix_inflight(struct user_struct *user, struct file *fp); void unix_notinflight(struct user_struct *user, struct file *fp); +void unix_destruct_scm(struct sk_buff *skb); void unix_gc(void); void wait_for_unix_gc(void); struct sock *unix_get_socket(struct file *filp); -- cgit v1.2.3 From 6b06314c47e141031be043539900d80d2c7ba10f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 10 Jan 2019 22:13:58 -0700 Subject: io_uring: add file set registration We normally have to fget/fput for each IO we do on a file. Even with the batching we do, the cost of the atomic inc/dec of the file usage count adds up. This adds IORING_REGISTER_FILES, and IORING_UNREGISTER_FILES opcodes for the io_uring_register(2) system call. The arguments passed in must be an array of __s32 holding file descriptors, and nr_args should hold the number of file descriptors the application wishes to pin for the duration of the io_uring instance (or until IORING_UNREGISTER_FILES is called). When used, the application must set IOSQE_FIXED_FILE in the sqe->flags member. Then, instead of setting sqe->fd to the real fd, it sets sqe->fd to the index in the array passed in to IORING_REGISTER_FILES. Files are automatically unregistered when the io_uring instance is torn down. An application need only unregister if it wishes to register a new set of fds. Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index cf28f7a11f12..6257478d55e9 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -16,7 +16,7 @@ */ struct io_uring_sqe { __u8 opcode; /* type of operation for this sqe */ - __u8 flags; /* as of now unused */ + __u8 flags; /* IOSQE_ flags */ __u16 ioprio; /* ioprio for the request */ __s32 fd; /* file descriptor to do IO on */ __u64 off; /* offset into file */ @@ -33,6 +33,11 @@ struct io_uring_sqe { }; }; +/* + * sqe->flags + */ +#define IOSQE_FIXED_FILE (1U << 0) /* use fixed fileset */ + /* * io_uring_setup() flags */ @@ -113,5 +118,7 @@ struct io_uring_params { */ #define IORING_REGISTER_BUFFERS 0 #define IORING_UNREGISTER_BUFFERS 1 +#define IORING_REGISTER_FILES 2 +#define IORING_UNREGISTER_FILES 3 #endif -- cgit v1.2.3 From 6c271ce2f1d572f7fa225700a13cfe7ced492434 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 10 Jan 2019 11:22:30 -0700 Subject: io_uring: add submission polling This enables an application to do IO, without ever entering the kernel. By using the SQ ring to fill in new sqes and watching for completions on the CQ ring, we can submit and reap IOs without doing a single system call. The kernel side thread will poll for new submissions, and in case of HIPRI/polled IO, it'll also poll for completions. By default, we allow 1 second of active spinning. This can by changed by passing in a different grace period at io_uring_register(2) time. If the thread exceeds this idle time without having any work to do, it will set: sq_ring->flags |= IORING_SQ_NEED_WAKEUP. The application will have to call io_uring_enter() to start things back up again. If IO is kept busy, that will never be needed. Basically an application that has this feature enabled will guard it's io_uring_enter(2) call with: read_barrier(); if (*sq_ring->flags & IORING_SQ_NEED_WAKEUP) io_uring_enter(fd, 0, 0, IORING_ENTER_SQ_WAKEUP); instead of calling it unconditionally. It's mandatory to use fixed files with this feature. Failure to do so will result in the application getting an -EBADF CQ entry when submitting IO. Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 6257478d55e9..0ec74bab8dbe 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -42,6 +42,8 @@ struct io_uring_sqe { * io_uring_setup() flags */ #define IORING_SETUP_IOPOLL (1U << 0) /* io_context is polled */ +#define IORING_SETUP_SQPOLL (1U << 1) /* SQ poll thread */ +#define IORING_SETUP_SQ_AFF (1U << 2) /* sq_thread_cpu is valid */ #define IORING_OP_NOP 0 #define IORING_OP_READV 1 @@ -86,6 +88,11 @@ struct io_sqring_offsets { __u64 resv2; }; +/* + * sq_ring->flags + */ +#define IORING_SQ_NEED_WAKEUP (1U << 0) /* needs io_uring_enter wakeup */ + struct io_cqring_offsets { __u32 head; __u32 tail; @@ -100,6 +107,7 @@ struct io_cqring_offsets { * io_uring_enter(2) flags */ #define IORING_ENTER_GETEVENTS (1U << 0) +#define IORING_ENTER_SQ_WAKEUP (1U << 1) /* * Passed in for io_uring_setup(2). Copied back with updated info on success @@ -108,7 +116,9 @@ struct io_uring_params { __u32 sq_entries; __u32 cq_entries; __u32 flags; - __u32 resv[7]; + __u32 sq_thread_cpu; + __u32 sq_thread_idle; + __u32 resv[5]; struct io_sqring_offsets sq_off; struct io_cqring_offsets cq_off; }; -- cgit v1.2.3 From 221c5eb2338232f7340386de1c43decc32682e58 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 17 Jan 2019 09:41:58 -0700 Subject: io_uring: add support for IORING_OP_POLL This is basically a direct port of bfe4037e722e, which implements a one-shot poll command through aio. Description below is based on that commit as well. However, instead of adding a POLL command and relying on io_cancel(2) to remove it, we mimic the epoll(2) interface of having a command to add a poll notification, IORING_OP_POLL_ADD, and one to remove it again, IORING_OP_POLL_REMOVE. To poll for a file descriptor the application should submit an sqe of type IORING_OP_POLL. It will poll the fd for the events specified in the poll_events field. Unlike poll or epoll without EPOLLONESHOT this interface always works in one shot mode, that is once the sqe is completed, it will have to be resubmitted. Reviewed-by: Hannes Reinecke Based-on-code-from: Christoph Hellwig Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 0ec74bab8dbe..e23408692118 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -25,6 +25,7 @@ struct io_uring_sqe { union { __kernel_rwf_t rw_flags; __u32 fsync_flags; + __u16 poll_events; }; __u64 user_data; /* data to be passed back at completion time */ union { @@ -51,6 +52,8 @@ struct io_uring_sqe { #define IORING_OP_FSYNC 3 #define IORING_OP_READ_FIXED 4 #define IORING_OP_WRITE_FIXED 5 +#define IORING_OP_POLL_ADD 6 +#define IORING_OP_POLL_REMOVE 7 /* * sqe->fsync_flags -- cgit v1.2.3