summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/userspace-api/ioctl/ioctl-number.rst1
-rw-r--r--Documentation/watch_queue.rst339
-rw-r--r--fs/pipe.c206
-rw-r--r--fs/splice.c12
-rw-r--r--include/linux/pipe_fs_i.h19
-rw-r--r--include/linux/watch_queue.h127
-rw-r--r--include/uapi/linux/watch_queue.h20
-rw-r--r--init/Kconfig12
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/watch_queue.c657
10 files changed, 1318 insertions, 76 deletions
diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index f759edafd938..9425377615ce 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -201,6 +201,7 @@ Code Seq# Include File Comments
'W' 00-1F linux/wanrouter.h conflict! (pre 3.9)
'W' 00-3F sound/asound.h conflict!
'W' 40-5F drivers/pci/switch/switchtec.c
+'W' 60-61 linux/watch_queue.h
'X' all fs/xfs/xfs_fs.h, conflict!
fs/xfs/linux-2.6/xfs_ioctl32.h,
include/linux/falloc.h,
diff --git a/Documentation/watch_queue.rst b/Documentation/watch_queue.rst
new file mode 100644
index 000000000000..849fad6893ef
--- /dev/null
+++ b/Documentation/watch_queue.rst
@@ -0,0 +1,339 @@
+==============================
+General notification mechanism
+==============================
+
+The general notification mechanism is built on top of the standard pipe driver
+whereby it effectively splices notification messages from the kernel into pipes
+opened by userspace. This can be used in conjunction with::
+
+ * Key/keyring notifications
+
+
+The notifications buffers can be enabled by:
+
+ "General setup"/"General notification queue"
+ (CONFIG_WATCH_QUEUE)
+
+This document has the following sections:
+
+.. contents:: :local:
+
+
+Overview
+========
+
+This facility appears as a pipe that is opened in a special mode. The pipe's
+internal ring buffer is used to hold messages that are generated by the kernel.
+These messages are then read out by read(). Splice and similar are disabled on
+such pipes due to them wanting to, under some circumstances, revert their
+additions to the ring - which might end up interleaved with notification
+messages.
+
+The owner of the pipe has to tell the kernel which sources it would like to
+watch through that pipe. Only sources that have been connected to a pipe will
+insert messages into it. Note that a source may be bound to multiple pipes and
+insert messages into all of them simultaneously.
+
+Filters may also be emplaced on a pipe so that certain source types and
+subevents can be ignored if they're not of interest.
+
+A message will be discarded if there isn't a slot available in the ring or if
+no preallocated message buffer is available. In both of these cases, read()
+will insert a WATCH_META_LOSS_NOTIFICATION message into the output buffer after
+the last message currently in the buffer has been read.
+
+Note that when producing a notification, the kernel does not wait for the
+consumers to collect it, but rather just continues on. This means that
+notifications can be generated whilst spinlocks are held and also protects the
+kernel from being held up indefinitely by a userspace malfunction.
+
+
+Message Structure
+=================
+
+Notification messages begin with a short header::
+
+ struct watch_notification {
+ __u32 type:24;
+ __u32 subtype:8;
+ __u32 info;
+ };
+
+"type" indicates the source of the notification record and "subtype" indicates
+the type of record from that source (see the Watch Sources section below). The
+type may also be "WATCH_TYPE_META". This is a special record type generated
+internally by the watch queue itself. There are two subtypes:
+
+ * WATCH_META_REMOVAL_NOTIFICATION
+ * WATCH_META_LOSS_NOTIFICATION
+
+The first indicates that an object on which a watch was installed was removed
+or destroyed and the second indicates that some messages have been lost.
+
+"info" indicates a bunch of things, including:
+
+ * The length of the message in bytes, including the header (mask with
+ WATCH_INFO_LENGTH and shift by WATCH_INFO_LENGTH__SHIFT). This indicates
+ the size of the record, which may be between 8 and 127 bytes.
+
+ * The watch ID (mask with WATCH_INFO_ID and shift by WATCH_INFO_ID__SHIFT).
+ This indicates that caller's ID of the watch, which may be between 0
+ and 255. Multiple watches may share a queue, and this provides a means to
+ distinguish them.
+
+ * A type-specific field (WATCH_INFO_TYPE_INFO). This is set by the
+ notification producer to indicate some meaning specific to the type and
+ subtype.
+
+Everything in info apart from the length can be used for filtering.
+
+The header can be followed by supplementary information. The format of this is
+at the discretion is defined by the type and subtype.
+
+
+Watch List (Notification Source) API
+====================================
+
+A "watch list" is a list of watchers that are subscribed to a source of
+notifications. A list may be attached to an object (say a key or a superblock)
+or may be global (say for device events). From a userspace perspective, a
+non-global watch list is typically referred to by reference to the object it
+belongs to (such as using KEYCTL_NOTIFY and giving it a key serial number to
+watch that specific key).
+
+To manage a watch list, the following functions are provided:
+
+ * ``void init_watch_list(struct watch_list *wlist,
+ void (*release_watch)(struct watch *wlist));``
+
+ Initialise a watch list. If ``release_watch`` is not NULL, then this
+ indicates a function that should be called when the watch_list object is
+ destroyed to discard any references the watch list holds on the watched
+ object.
+
+ * ``void remove_watch_list(struct watch_list *wlist);``
+
+ This removes all of the watches subscribed to a watch_list and frees them
+ and then destroys the watch_list object itself.
+
+
+Watch Queue (Notification Output) API
+=====================================
+
+A "watch queue" is the buffer allocated by an application that notification
+records will be written into. The workings of this are hidden entirely inside
+of the pipe device driver, but it is necessary to gain a reference to it to set
+a watch. These can be managed with:
+
+ * ``struct watch_queue *get_watch_queue(int fd);``
+
+ Since watch queues are indicated to the kernel by the fd of the pipe that
+ implements the buffer, userspace must hand that fd through a system call.
+ This can be used to look up an opaque pointer to the watch queue from the
+ system call.
+
+ * ``void put_watch_queue(struct watch_queue *wqueue);``
+
+ This discards the reference obtained from ``get_watch_queue()``.
+
+
+Watch Subscription API
+======================
+
+A "watch" is a subscription on a watch list, indicating the watch queue, and
+thus the buffer, into which notification records should be written. The watch
+queue object may also carry filtering rules for that object, as set by
+userspace. Some parts of the watch struct can be set by the driver::
+
+ struct watch {
+ union {
+ u32 info_id; /* ID to be OR'd in to info field */
+ ...
+ };
+ void *private; /* Private data for the watched object */
+ u64 id; /* Internal identifier */
+ ...
+ };
+
+The ``info_id`` value should be an 8-bit number obtained from userspace and
+shifted by WATCH_INFO_ID__SHIFT. This is OR'd into the WATCH_INFO_ID field of
+struct watch_notification::info when and if the notification is written into
+the associated watch queue buffer.
+
+The ``private`` field is the driver's data associated with the watch_list and
+is cleaned up by the ``watch_list::release_watch()`` method.
+
+The ``id`` field is the source's ID. Notifications that are posted with a
+different ID are ignored.
+
+The following functions are provided to manage watches:
+
+ * ``void init_watch(struct watch *watch, struct watch_queue *wqueue);``
+
+ Initialise a watch object, setting its pointer to the watch queue, using
+ appropriate barriering to avoid lockdep complaints.
+
+ * ``int add_watch_to_object(struct watch *watch, struct watch_list *wlist);``
+
+ Subscribe a watch to a watch list (notification source). The
+ driver-settable fields in the watch struct must have been set before this
+ is called.
+
+ * ``int remove_watch_from_object(struct watch_list *wlist,
+ struct watch_queue *wqueue,
+ u64 id, false);``
+
+ Remove a watch from a watch list, where the watch must match the specified
+ watch queue (``wqueue``) and object identifier (``id``). A notification
+ (``WATCH_META_REMOVAL_NOTIFICATION``) is sent to the watch queue to
+ indicate that the watch got removed.
+
+ * ``int remove_watch_from_object(struct watch_list *wlist, NULL, 0, true);``
+
+ Remove all the watches from a watch list. It is expected that this will be
+ called preparatory to destruction and that the watch list will be
+ inaccessible to new watches by this point. A notification
+ (``WATCH_META_REMOVAL_NOTIFICATION``) is sent to the watch queue of each
+ subscribed watch to indicate that the watch got removed.
+
+
+Notification Posting API
+========================
+
+To post a notification to watch list so that the subscribed watches can see it,
+the following function should be used::
+
+ void post_watch_notification(struct watch_list *wlist,
+ struct watch_notification *n,
+ const struct cred *cred,
+ u64 id);
+
+The notification should be preformatted and a pointer to the header (``n``)
+should be passed in. The notification may be larger than this and the size in
+units of buffer slots is noted in ``n->info & WATCH_INFO_LENGTH``.
+
+The ``cred`` struct indicates the credentials of the source (subject) and is
+passed to the LSMs, such as SELinux, to allow or suppress the recording of the
+note in each individual queue according to the credentials of that queue
+(object).
+
+The ``id`` is the ID of the source object (such as the serial number on a key).
+Only watches that have the same ID set in them will see this notification.
+
+
+Watch Sources
+=============
+
+Any particular buffer can be fed from multiple sources. Sources include:
+
+ * WATCH_TYPE_KEY_NOTIFY
+
+ Notifications of this type indicate changes to keys and keyrings, including
+ the changes of keyring contents or the attributes of keys.
+
+ See Documentation/security/keys/core.rst for more information.
+
+
+Event Filtering
+===============
+
+Once a watch queue has been created, a set of filters can be applied to limit
+the events that are received using::
+
+ struct watch_notification_filter filter = {
+ ...
+ };
+ ioctl(fd, IOC_WATCH_QUEUE_SET_FILTER, &filter)
+
+The filter description is a variable of type::
+
+ struct watch_notification_filter {
+ __u32 nr_filters;
+ __u32 __reserved;
+ struct watch_notification_type_filter filters[];
+ };
+
+Where "nr_filters" is the number of filters in filters[] and "__reserved"
+should be 0. The "filters" array has elements of the following type::
+
+ struct watch_notification_type_filter {
+ __u32 type;
+ __u32 info_filter;
+ __u32 info_mask;
+ __u32 subtype_filter[8];
+ };
+
+Where:
+
+ * ``type`` is the event type to filter for and should be something like
+ "WATCH_TYPE_KEY_NOTIFY"
+
+ * ``info_filter`` and ``info_mask`` act as a filter on the info field of the
+ notification record. The notification is only written into the buffer if::
+
+ (watch.info & info_mask) == info_filter
+
+ This could be used, for example, to ignore events that are not exactly on
+ the watched point in a mount tree.
+
+ * ``subtype_filter`` is a bitmask indicating the subtypes that are of
+ interest. Bit 0 of subtype_filter[0] corresponds to subtype 0, bit 1 to
+ subtype 1, and so on.
+
+If the argument to the ioctl() is NULL, then the filters will be removed and
+all events from the watched sources will come through.
+
+
+Userspace Code Example
+======================
+
+A buffer is created with something like the following::
+
+ pipe2(fds, O_TMPFILE);
+ ioctl(fds[1], IOC_WATCH_QUEUE_SET_SIZE, 256);
+
+It can then be set to receive keyring change notifications::
+
+ keyctl(KEYCTL_WATCH_KEY, KEY_SPEC_SESSION_KEYRING, fds[1], 0x01);
+
+The notifications can then be consumed by something like the following::
+
+ static void consumer(int rfd, struct watch_queue_buffer *buf)
+ {
+ unsigned char buffer[128];
+ ssize_t buf_len;
+
+ while (buf_len = read(rfd, buffer, sizeof(buffer)),
+ buf_len > 0
+ ) {
+ void *p = buffer;
+ void *end = buffer + buf_len;
+ while (p < end) {
+ union {
+ struct watch_notification n;
+ unsigned char buf1[128];
+ } n;
+ size_t largest, len;
+
+ largest = end - p;
+ if (largest > 128)
+ largest = 128;
+ memcpy(&n, p, largest);
+
+ len = (n->info & WATCH_INFO_LENGTH) >>
+ WATCH_INFO_LENGTH__SHIFT;
+ if (len == 0 || len > largest)
+ return;
+
+ switch (n.n.type) {
+ case WATCH_TYPE_META:
+ got_meta(&n.n);
+ case WATCH_TYPE_KEY_NOTIFY:
+ saw_key_change(&n.n);
+ break;
+ }
+
+ p += len;
+ }
+ }
+ }
diff --git a/fs/pipe.c b/fs/pipe.c
index 16fb72e9abf7..da9bc1f21fd1 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -24,6 +24,7 @@
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <linux/memcontrol.h>
+#include <linux/watch_queue.h>
#include <linux/uaccess.h>
#include <asm/ioctls.h>
@@ -459,6 +460,13 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
goto out;
}
+#ifdef CONFIG_WATCH_QUEUE
+ if (pipe->watch_queue) {
+ ret = -EXDEV;
+ goto out;
+ }
+#endif
+
/*
* Only wake up if the pipe started out empty, since
* otherwise there should be no readers waiting.
@@ -628,22 +636,37 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
int count, head, tail, mask;
switch (cmd) {
- case FIONREAD:
- __pipe_lock(pipe);
- count = 0;
- head = pipe->head;
- tail = pipe->tail;
- mask = pipe->ring_size - 1;
+ case FIONREAD:
+ __pipe_lock(pipe);
+ count = 0;
+ head = pipe->head;
+ tail = pipe->tail;
+ mask = pipe->ring_size - 1;
- while (tail != head) {
- count += pipe->bufs[tail & mask].len;
- tail++;
- }
- __pipe_unlock(pipe);
+ while (tail != head) {
+ count += pipe->bufs[tail & mask].len;
+ tail++;
+ }
+ __pipe_unlock(pipe);
- return put_user(count, (int __user *)arg);
- default:
- return -ENOIOCTLCMD;
+ return put_user(count, (int __user *)arg);
+
+#ifdef CONFIG_WATCH_QUEUE
+ case IOC_WATCH_QUEUE_SET_SIZE: {
+ int ret;
+ __pipe_lock(pipe);
+ ret = watch_queue_set_size(pipe, arg);
+ __pipe_unlock(pipe);
+ return ret;
+ }
+
+ case IOC_WATCH_QUEUE_SET_FILTER:
+ return watch_queue_set_filter(
+ pipe, (struct watch_notification_filter __user *)arg);
+#endif
+
+ default:
+ return -ENOIOCTLCMD;
}
}
@@ -754,27 +777,27 @@ pipe_fasync(int fd, struct file *filp, int on)
return retval;
}
-static unsigned long account_pipe_buffers(struct user_struct *user,
- unsigned long old, unsigned long new)
+unsigned long account_pipe_buffers(struct user_struct *user,
+ unsigned long old, unsigned long new)
{
return atomic_long_add_return(new - old, &user->pipe_bufs);
}
-static bool too_many_pipe_buffers_soft(unsigned long user_bufs)
+bool too_many_pipe_buffers_soft(unsigned long user_bufs)
{
unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft);
return soft_limit && user_bufs > soft_limit;
}
-static bool too_many_pipe_buffers_hard(unsigned long user_bufs)
+bool too_many_pipe_buffers_hard(unsigned long user_bufs)
{
unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard);
return hard_limit && user_bufs > hard_limit;
}
-static bool is_unprivileged_user(void)
+bool pipe_is_unprivileged_user(void)
{
return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
}
@@ -796,12 +819,12 @@ struct pipe_inode_info *alloc_pipe_info(void)
user_bufs = account_pipe_buffers(user, 0, pipe_bufs);
- if (too_many_pipe_buffers_soft(user_bufs) && is_unprivileged_user()) {
+ if (too_many_pipe_buffers_soft(user_bufs) && pipe_is_unprivileged_user()) {
user_bufs = account_pipe_buffers(user, pipe_bufs, 1);
pipe_bufs = 1;
}
- if (too_many_pipe_buffers_hard(user_bufs) && is_unprivileged_user())
+ if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user())
goto out_revert_acct;
pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
@@ -813,6 +836,7 @@ struct pipe_inode_info *alloc_pipe_info(void)
pipe->r_counter = pipe->w_counter = 1;
pipe->max_usage = pipe_bufs;
pipe->ring_size = pipe_bufs;
+ pipe->nr_accounted = pipe_bufs;
pipe->user = user;
mutex_init(&pipe->mutex);
return pipe;
@@ -830,7 +854,14 @@ void free_pipe_info(struct pipe_inode_info *pipe)
{
int i;
- (void) account_pipe_buffers(pipe->user, pipe->ring_size, 0);
+#ifdef CONFIG_WATCH_QUEUE
+ if (pipe->watch_queue) {
+ watch_queue_clear(pipe->watch_queue);
+ put_watch_queue(pipe->watch_queue);
+ }
+#endif
+
+ (void) account_pipe_buffers(pipe->user, pipe->nr_accounted, 0);
free_uid(pipe->user);
for (i = 0; i < pipe->ring_size; i++) {
struct pipe_buffer *buf = pipe->bufs + i;
@@ -906,6 +937,17 @@ int create_pipe_files(struct file **res, int flags)
if (!inode)
return -ENFILE;
+ if (flags & O_NOTIFICATION_PIPE) {
+#ifdef CONFIG_WATCH_QUEUE
+ if (watch_queue_init(inode->i_pipe) < 0) {
+ iput(inode);
+ return -ENOMEM;
+ }
+#else
+ return -ENOPKG;
+#endif
+ }
+
f = alloc_file_pseudo(inode, pipe_mnt, "",
O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)),
&pipefifo_fops);
@@ -936,7 +978,7 @@ static int __do_pipe_flags(int *fd, struct file **files, int flags)
int error;
int fdw, fdr;
- if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT))
+ if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE))
return -EINVAL;
error = create_pipe_files(files, flags);
@@ -1184,42 +1226,12 @@ unsigned int round_pipe_size(unsigned long size)
}
/*
- * Allocate a new array of pipe buffers and copy the info over. Returns the
- * pipe size if successful, or return -ERROR on error.
+ * Resize the pipe ring to a number of slots.
*/
-static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
+int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
{
struct pipe_buffer *bufs;
- unsigned int size, nr_slots, head, tail, mask, n;
- unsigned long user_bufs;
- long ret = 0;
-
- size = round_pipe_size(arg);
- nr_slots = size >> PAGE_SHIFT;
-
- if (!nr_slots)
- return -EINVAL;
-
- /*
- * If trying to increase the pipe capacity, check that an
- * unprivileged user is not trying to exceed various limits
- * (soft limit check here, hard limit check just below).
- * Decreasing the pipe capacity is always permitted, even
- * if the user is currently over a limit.
- */
- if (nr_slots > pipe->ring_size &&
- size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
- return -EPERM;
-
- user_bufs = account_pipe_buffers(pipe->user, pipe->ring_size, nr_slots);
-
- if (nr_slots > pipe->ring_size &&
- (too_many_pipe_buffers_hard(user_bufs) ||
- too_many_pipe_buffers_soft(user_bufs)) &&
- is_unprivileged_user()) {
- ret = -EPERM;
- goto out_revert_acct;
- }
+ unsigned int head, tail, mask, n;
/*
* We can shrink the pipe, if arg is greater than the ring occupancy.
@@ -1231,17 +1243,13 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
head = pipe->head;
tail = pipe->tail;
n = pipe_occupancy(pipe->head, pipe->tail);
- if (nr_slots < n) {
- ret = -EBUSY;
- goto out_revert_acct;
- }
+ if (nr_slots < n)
+ return -EBUSY;
bufs = kcalloc(nr_slots, sizeof(*bufs),
GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
- if (unlikely(!bufs)) {
- ret = -ENOMEM;
- goto out_revert_acct;
- }
+ if (unlikely(!bufs))
+ return -ENOMEM;
/*
* The pipe array wraps around, so just start the new one at zero
@@ -1269,16 +1277,68 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
kfree(pipe->bufs);
pipe->bufs = bufs;
pipe->ring_size = nr_slots;
- pipe->max_usage = nr_slots;
+ if (pipe->max_usage > nr_slots)
+ pipe->max_usage = nr_slots;
pipe->tail = tail;
pipe->head = head;
/* This might have made more room for writers */
wake_up_interruptible(&pipe->wr_wait);
+ return 0;
+}
+
+/*
+ * Allocate a new array of pipe buffers and copy the info over. Returns the
+ * pipe size if successful, or return -ERROR on error.
+ */
+static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
+{
+ unsigned long user_bufs;
+ unsigned int nr_slots, size;
+ long ret = 0;
+
+#ifdef CONFIG_WATCH_QUEUE
+ if (pipe->watch_queue)
+ return -EBUSY;
+#endif
+
+ size = round_pipe_size(arg);
+ nr_slots = size >> PAGE_SHIFT;
+
+ if (!nr_slots)
+ return -EINVAL;
+
+ /*
+ * If trying to increase the pipe capacity, check that an
+ * unprivileged user is not trying to exceed various limits
+ * (soft limit check here, hard limit check just below).
+ * Decreasing the pipe capacity is always permitted, even
+ * if the user is currently over a limit.
+ */
+ if (nr_slots > pipe->max_usage &&
+ size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_slots);
+
+ if (nr_slots > pipe->max_usage &&
+ (too_many_pipe_buffers_hard(user_bufs) ||
+ too_many_pipe_buffers_soft(user_bufs)) &&
+ pipe_is_unprivileged_user()) {
+ ret = -EPERM;
+ goto out_revert_acct;
+ }
+
+ ret = pipe_resize_ring(pipe, nr_slots);
+ if (ret < 0)
+ goto out_revert_acct;
+
+ pipe->max_usage = nr_slots;
+ pipe->nr_accounted = nr_slots;
return pipe->max_usage * PAGE_SIZE;
out_revert_acct:
- (void) account_pipe_buffers(pipe->user, nr_slots, pipe->ring_size);
+ (void) account_pipe_buffers(pipe->user, nr_slots, pipe->nr_accounted);
return ret;
}
@@ -1287,9 +1347,17 @@ out_revert_acct:
* location, so checking ->i_pipe is not enough to verify that this is a
* pipe.
*/
-struct pipe_inode_info *get_pipe_info(struct file *file)
+struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice)
{
- return file->f_op == &pipefifo_fops ? file->private_data : NULL;
+ struct pipe_inode_info *pipe = file->private_data;
+
+ if (file->f_op != &pipefifo_fops || !pipe)
+ return NULL;
+#ifdef CONFIG_WATCH_QUEUE
+ if (for_splice && pipe->watch_queue)
+ return NULL;
+#endif
+ return pipe;
}
long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
@@ -1297,7 +1365,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
struct pipe_inode_info *pipe;
long ret;
- pipe = get_pipe_info(file);
+ pipe = get_pipe_info(file, false);
if (!pipe)
return -EBADF;
diff --git a/fs/splice.c b/fs/splice.c
index fd0a1e7e5959..50f3c0260c00 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1122,8 +1122,8 @@ long do_splice(struct file *in, loff_t __user *off_in,
!(out->f_mode & FMODE_WRITE)))
return -EBADF;
- ipipe = get_pipe_info(in);
- opipe = get_pipe_info(out);
+ ipipe = get_pipe_info(in, true);
+ opipe = get_pipe_info(out, true);
if (ipipe && opipe) {
if (off_in || off_out)
@@ -1273,7 +1273,7 @@ static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
static long vmsplice_to_user(struct file *file, struct iov_iter *iter,
unsigned int flags)
{
- struct pipe_inode_info *pipe = get_pipe_info(file);
+ struct pipe_inode_info *pipe = get_pipe_info(file, true);
struct splice_desc sd = {
.total_len = iov_iter_count(iter),
.flags = flags,
@@ -1308,7 +1308,7 @@ static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter,
if (flags & SPLICE_F_GIFT)
buf_flag = PIPE_BUF_FLAG_GIFT;
- pipe = get_pipe_info(file);
+ pipe = get_pipe_info(file, true);
if (!pipe)
return -EBADF;
@@ -1757,8 +1757,8 @@ static int link_pipe(struct pipe_inode_info *ipipe,
static long do_tee(struct file *in, struct file *out, size_t len,
unsigned int flags)
{
- struct pipe_inode_info *ipipe = get_pipe_info(in);
- struct pipe_inode_info *opipe = get_pipe_info(out);
+ struct pipe_inode_info *ipipe = get_pipe_info(in, true);
+ struct pipe_inode_info *opipe = get_pipe_info(out, true);
int ret = -EINVAL;
if (unlikely(!(in->f_mode & FMODE_READ) ||
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index ae58fad7f1e0..1d3eaa233f4a 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -35,6 +35,7 @@ struct pipe_buffer {
* @tail: The point of buffer consumption
* @max_usage: The maximum number of slots that may be used in the ring
* @ring_size: total number of buffers (should be a power of 2)
+ * @nr_accounted: The amount this pipe accounts for in user->pipe_bufs
* @tmp_page: cached released page
* @readers: number of current readers of this pipe
* @writers: number of current writers of this pipe
@@ -45,6 +46,7 @@ struct pipe_buffer {
* @fasync_writers: writer side fasync
* @bufs: the circular array of pipe buffers
* @user: the user who created this pipe
+ * @watch_queue: If this pipe is a watch_queue, this is the stuff for that
**/
struct pipe_inode_info {
struct mutex mutex;
@@ -53,6 +55,7 @@ struct pipe_inode_info {
unsigned int tail;
unsigned int max_usage;
unsigned int ring_size;
+ unsigned int nr_accounted;
unsigned int readers;
unsigned int writers;
unsigned int files;
@@ -63,6 +66,9 @@ struct pipe_inode_info {
struct fasync_struct *fasync_writers;
struct pipe_buffer *bufs;
struct user_struct *user;
+#ifdef CONFIG_WATCH_QUEUE
+ struct watch_queue *watch_queue;
+#endif
};
/*
@@ -237,9 +243,20 @@ void pipe_buf_mark_unmergeable(struct pipe_buffer *buf);
extern const struct pipe_buf_operations nosteal_pipe_buf_ops;
+#ifdef CONFIG_WATCH_QUEUE
+unsigned long account_pipe_buffers(struct user_struct *user,
+ unsigned long old, unsigned long new);
+bool too_many_pipe_buffers_soft(unsigned long user_bufs);
+bool too_many_pipe_buffers_hard(unsigned long user_bufs);
+bool pipe_is_unprivileged_user(void);
+#endif
+
/* for F_SETPIPE_SZ and F_GETPIPE_SZ */
+#ifdef CONFIG_WATCH_QUEUE
+int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots);
+#endif
long pipe_fcntl(struct file *, unsigned int, unsigned long arg);
-struct pipe_inode_info *get_pipe_info(struct file *file);
+struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice);
int create_pipe_files(struct file **, int);
unsigned int round_pipe_size(unsigned long size);
diff --git a/include/linux/watch_queue.h b/include/linux/watch_queue.h
new file mode 100644
index 000000000000..5e08db2adc31
--- /dev/null
+++ b/include/linux/watch_queue.h
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: GPL-2.0
+/* User-mappable watch queue
+ *
+ * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * See Documentation/watch_queue.rst
+ */
+
+#ifndef _LINUX_WATCH_QUEUE_H
+#define _LINUX_WATCH_QUEUE_H
+
+#include <uapi/linux/watch_queue.h>
+#include <linux/kref.h>
+#include <linux/rcupdate.h>
+
+#ifdef CONFIG_WATCH_QUEUE
+
+struct cred;
+
+struct watch_type_filter {
+ enum watch_notification_type type;
+ __u32 subtype_filter[1]; /* Bitmask of subtypes to filter on */
+ __u32 info_filter; /* Filter on watch_notification::info */
+ __u32 info_mask; /* Mask of relevant bits in info_filter */
+};
+
+struct watch_filter {
+ union {
+ struct rcu_head rcu;
+ unsigned long type_filter[2]; /* Bitmask of accepted types */
+ };
+ u32 nr_filters; /* Number of filters */
+ struct watch_type_filter filters[];
+};
+
+struct watch_queue {
+ struct rcu_head rcu;
+ struct watch_filter __rcu *filter;
+ struct pipe_inode_info *pipe; /* The pipe we're using as a buffer */
+ struct hlist_head watches; /* Contributory watches */
+ struct page **notes; /* Preallocated notifications */
+ unsigned long *notes_bitmap; /* Allocation bitmap for notes */
+ struct kref usage; /* Object usage count */
+ spinlock_t lock;
+ unsigned int nr_notes; /* Number of notes */
+ unsigned int nr_pages; /* Number of pages in notes[] */
+ bool defunct; /* T when queues closed */
+};
+
+/*
+ * Representation of a watch on an object.
+ */
+struct watch {
+ union {
+ struct rcu_head rcu;
+ u32 info_id; /* ID to be OR'd in to info field */
+ };
+ struct watch_queue __rcu *queue; /* Queue to post events to */
+ struct hlist_node queue_node; /* Link in queue->watches */
+ struct watch_list __rcu *watch_list;
+ struct hlist_node list_node; /* Link in watch_list->watchers */
+ const struct cred *cred; /* Creds of the owner of the watch */
+ void *private; /* Private data for the watched object */
+ u64 id; /* Internal identifier */
+ struct kref usage; /* Object usage count */
+};
+
+/*
+ * List of watches on an object.
+ */
+struct watch_list {
+ struct rcu_head rcu;
+ struct hlist_head watchers;
+ void (*release_watch)(struct watch *);
+ spinlock_t lock;
+};
+
+extern void __post_watch_notification(struct watch_list *,
+ struct watch_notification *,
+ const struct cred *,
+ u64);
+extern struct watch_queue *get_watch_queue(int);
+extern void put_watch_queue(struct watch_queue *);
+extern void init_watch(struct watch *, struct watch_queue *);
+extern int add_watch_to_object(struct watch *, struct watch_list *);
+extern int remove_watch_from_object(struct watch_list *, struct watch_queue *, u64, bool);
+extern long watch_queue_set_size(struct pipe_inode_info *, unsigned int);
+extern long watch_queue_set_filter(struct pipe_inode_info *,
+ struct watch_notification_filter __user *);
+extern int watch_queue_init(struct pipe_inode_info *);
+extern void watch_queue_clear(struct watch_queue *);
+
+static inline void init_watch_list(struct watch_list *wlist,
+ void (*release_watch)(struct watch *))
+{
+ INIT_HLIST_HEAD(&wlist->watchers);
+ spin_lock_init(&wlist->lock);
+ wlist->release_watch = release_watch;
+}
+
+static inline void post_watch_notification(struct watch_list *wlist,
+ struct watch_notification *n,
+ const struct cred *cred,
+ u64 id)
+{
+ if (unlikely(wlist))
+ __post_watch_notification(wlist, n, cred, id);
+}
+
+static inline void remove_watch_list(struct watch_list *wlist, u64 id)
+{
+ if (wlist) {
+ remove_watch_from_object(wlist, NULL, id, true);
+ kfree_rcu(wlist, rcu);
+ }
+}
+
+/**
+ * watch_sizeof - Calculate the information part of the size of a watch record,
+ * given the structure size.
+ */
+#define watch_sizeof(STRUCT) (sizeof(STRUCT) << WATCH_INFO_LENGTH__SHIFT)
+
+#endif
+
+#endif /* _LINUX_WATCH_QUEUE_H */
diff --git a/include/uapi/linux/watch_queue.h b/include/uapi/linux/watch_queue.h
index 9df72227f515..3a5790f1f05d 100644
--- a/include/uapi/linux/watch_queue.h
+++ b/include/uapi/linux/watch_queue.h
@@ -4,9 +4,13 @@
#include <linux/types.h>
#include <linux/fcntl.h>
+#include <linux/ioctl.h>
#define O_NOTIFICATION_PIPE O_EXCL /* Parameter to pipe2() selecting notification pipe */
+#define IOC_WATCH_QUEUE_SET_SIZE _IO('W', 0x60) /* Set the size in pages */
+#define IOC_WATCH_QUEUE_SET_FILTER _IO('W', 0x61) /* Set the filter */
+
enum watch_notification_type {
WATCH_TYPE_META = 0, /* Special record */
WATCH_TYPE__NR = 1
@@ -41,6 +45,22 @@ struct watch_notification {
#define WATCH_INFO_FLAG_7 0x00800000
};
+/*
+ * Notification filtering rules (IOC_WATCH_QUEUE_SET_FILTER).
+ */
+struct watch_notification_type_filter {
+ __u32 type; /* Type to apply filter to */
+ __u32 info_filter; /* Filter on watch_notification::info */
+ __u32 info_mask; /* Mask of relevant bits in info_filter */
+ __u32 subtype_filter[8]; /* Bitmask of subtypes to filter on */
+};
+
+struct watch_notification_filter {
+ __u32 nr_filters; /* Number of filters */
+ __u32 __reserved; /* Must be 0 */
+ struct watch_notification_type_filter filters[];
+};
+
/*
* Extended watch removal notification. This is used optionally if the type
diff --git a/init/Kconfig b/init/Kconfig
index 74a5ac65644f..c95a2a5654a9 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -326,6 +326,18 @@ config POSIX_MQUEUE_SYSCTL
depends on SYSCTL
default y
+config WATCH_QUEUE
+ bool "General notification queue"
+ default n
+ help
+
+ This is a general notification queue for the kernel to pass events to
+ userspace by splicing them into pipes. It can be used in conjunction
+ with watches for key/keyring change notifications and device
+ notifications.
+
+ See Documentation/watch_queue.rst
+
config CROSS_MEMORY_ATTACH
bool "Enable process_vm_readv/writev syscalls"
depends on MMU
diff --git a/kernel/Makefile b/kernel/Makefile
index 4cb4130ced32..41e7e3ae07ec 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -115,6 +115,7 @@ obj-$(CONFIG_TORTURE_TEST) += torture.o
obj-$(CONFIG_HAS_IOMEM) += iomem.o
obj-$(CONFIG_RSEQ) += rseq.o
+obj-$(CONFIG_WATCH_QUEUE) += watch_queue.o
obj-$(CONFIG_SYSCTL_KUNIT_TEST) += sysctl-test.o
diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
new file mode 100644
index 000000000000..c103e34f8705
--- /dev/null
+++ b/kernel/watch_queue.c
@@ -0,0 +1,657 @@
+// SPDX-License-Ident