summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAndy King <acking@vmware.com>2013-02-06 14:23:56 +0000
committerDavid S. Miller <davem@davemloft.net>2013-02-10 19:41:08 -0500
commitd021c344051af91f42c5ba9fdedc176740cbd238 (patch)
tree8c02cd94a59556da4b74823816e670dd007db72f
parentfd5023111cf720db890ef34f305ac5d427e690a0 (diff)
VSOCK: Introduce VM Sockets
VM Sockets allows communication between virtual machines and the hypervisor. User level applications both in a virtual machine and on the host can use the VM Sockets API, which facilitates fast and efficient communication between guest virtual machines and their host. A socket address family, designed to be compatible with UDP and TCP at the interface level, is provided. Today, VM Sockets is used by various VMware Tools components inside the guest for zero-config, network-less access to VMware host services. In addition to this, VMware's users are using VM Sockets for various applications, where network access of the virtual machine is restricted or non-existent. Examples of this are VMs communicating with device proxies for proprietary hardware running as host applications and automated testing of applications running within virtual machines. The VMware VM Sockets are similar to other socket types, like Berkeley UNIX socket interface. The VM Sockets module supports both connection-oriented stream sockets like TCP, and connectionless datagram sockets like UDP. The VM Sockets protocol family is defined as "AF_VSOCK" and the socket operations split for SOCK_DGRAM and SOCK_STREAM. For additional information about the use of VM Sockets, please refer to the VM Sockets Programming Guide available at: https://www.vmware.com/support/developer/vmci-sdk/ Signed-off-by: George Zhang <georgezhang@vmware.com> Signed-off-by: Dmitry Torokhov <dtor@vmware.com> Signed-off-by: Andy king <acking@vmware.com> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--include/linux/socket.h4
-rw-r--r--include/uapi/linux/vm_sockets.h171
-rw-r--r--net/Kconfig1
-rw-r--r--net/Makefile1
-rw-r--r--net/vmw_vsock/Kconfig28
-rw-r--r--net/vmw_vsock/Makefile7
-rw-r--r--net/vmw_vsock/af_vsock.c2015
-rw-r--r--net/vmw_vsock/af_vsock.h175
-rw-r--r--net/vmw_vsock/vmci_transport.c2157
-rw-r--r--net/vmw_vsock/vmci_transport.h139
-rw-r--r--net/vmw_vsock/vmci_transport_notify.c680
-rw-r--r--net/vmw_vsock/vmci_transport_notify.h83
-rw-r--r--net/vmw_vsock/vmci_transport_notify_qstate.c438
-rw-r--r--net/vmw_vsock/vsock_addr.c86
-rw-r--r--net/vmw_vsock/vsock_addr.h32
-rw-r--r--net/vmw_vsock/vsock_version.h22
16 files changed, 6038 insertions, 1 deletions
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 9a546ff853dc..2b9f74b0ffea 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -178,7 +178,8 @@ struct ucred {
#define AF_CAIF 37 /* CAIF sockets */
#define AF_ALG 38 /* Algorithm sockets */
#define AF_NFC 39 /* NFC sockets */
-#define AF_MAX 40 /* For now.. */
+#define AF_VSOCK 40 /* vSockets */
+#define AF_MAX 41 /* For now.. */
/* Protocol families, same as address families. */
#define PF_UNSPEC AF_UNSPEC
@@ -221,6 +222,7 @@ struct ucred {
#define PF_CAIF AF_CAIF
#define PF_ALG AF_ALG
#define PF_NFC AF_NFC
+#define PF_VSOCK AF_VSOCK
#define PF_MAX AF_MAX
/* Maximum queue length specifiable by listen. */
diff --git a/include/uapi/linux/vm_sockets.h b/include/uapi/linux/vm_sockets.h
new file mode 100644
index 000000000000..f7f2e99dec84
--- /dev/null
+++ b/include/uapi/linux/vm_sockets.h
@@ -0,0 +1,171 @@
+/*
+ * VMware vSockets Driver
+ *
+ * Copyright (C) 2007-2013 VMware, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation version 2 and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _VM_SOCKETS_H_
+#define _VM_SOCKETS_H_
+
+#if !defined(__KERNEL__)
+#include <sys/socket.h>
+#endif
+
+/* Option name for STREAM socket buffer size. Use as the option name in
+ * setsockopt(3) or getsockopt(3) to set or get an unsigned long long that
+ * specifies the size of the buffer underlying a vSockets STREAM socket.
+ * Value is clamped to the MIN and MAX.
+ */
+
+#define SO_VM_SOCKETS_BUFFER_SIZE 0
+
+/* Option name for STREAM socket minimum buffer size. Use as the option name
+ * in setsockopt(3) or getsockopt(3) to set or get an unsigned long long that
+ * specifies the minimum size allowed for the buffer underlying a vSockets
+ * STREAM socket.
+ */
+
+#define SO_VM_SOCKETS_BUFFER_MIN_SIZE 1
+
+/* Option name for STREAM socket maximum buffer size. Use as the option name
+ * in setsockopt(3) or getsockopt(3) to set or get an unsigned long long
+ * that specifies the maximum size allowed for the buffer underlying a
+ * vSockets STREAM socket.
+ */
+
+#define SO_VM_SOCKETS_BUFFER_MAX_SIZE 2
+
+/* Option name for socket peer's host-specific VM ID. Use as the option name
+ * in getsockopt(3) to get a host-specific identifier for the peer endpoint's
+ * VM. The identifier is a signed integer.
+ * Only available for hypervisor endpoints.
+ */
+
+#define SO_VM_SOCKETS_PEER_HOST_VM_ID 3
+
+/* Option name for socket's service label. Use as the option name in
+ * setsockopt(3) or getsockopt(3) to set or get the service label for a socket.
+ * The service label is a C-style NUL-terminated string. Only available for
+ * hypervisor endpoints.
+ */
+
+#define SO_VM_SOCKETS_SERVICE_LABEL 4
+
+/* Option name for determining if a socket is trusted. Use as the option name
+ * in getsockopt(3) to determine if a socket is trusted. The value is a
+ * signed integer.
+ */
+
+#define SO_VM_SOCKETS_TRUSTED 5
+
+/* Option name for STREAM socket connection timeout. Use as the option name
+ * in setsockopt(3) or getsockopt(3) to set or get the connection
+ * timeout for a STREAM socket.
+ */
+
+#define SO_VM_SOCKETS_CONNECT_TIMEOUT 6
+
+/* Option name for using non-blocking send/receive. Use as the option name
+ * for setsockopt(3) or getsockopt(3) to set or get the non-blocking
+ * transmit/receive flag for a STREAM socket. This flag determines whether
+ * send() and recv() can be called in non-blocking contexts for the given
+ * socket. The value is a signed integer.
+ *
+ * This option is only relevant to kernel endpoints, where descheduling the
+ * thread of execution is not allowed, for example, while holding a spinlock.
+ * It is not to be confused with conventional non-blocking socket operations.
+ *
+ * Only available for hypervisor endpoints.
+ */
+
+#define SO_VM_SOCKETS_NONBLOCK_TXRX 7
+
+/* The vSocket equivalent of INADDR_ANY. This works for the svm_cid field of
+ * sockaddr_vm and indicates the context ID of the current endpoint.
+ */
+
+#define VMADDR_CID_ANY -1U
+
+/* Bind to any available port. Works for the svm_port field of
+ * sockaddr_vm.
+ */
+
+#define VMADDR_PORT_ANY -1U
+
+/* Use this as the destination CID in an address when referring to the
+ * hypervisor. VMCI relies on it being 0, but this would be useful for other
+ * transports too.
+ */
+
+#define VMADDR_CID_HYPERVISOR 0
+
+/* This CID is specific to VMCI and can be considered reserved (even VMCI
+ * doesn't use it anymore, it's a legacy value from an older release).
+ */
+
+#define VMADDR_CID_RESERVED 1
+
+/* Use this as the destination CID in an address when referring to the host
+ * (any process other than the hypervisor). VMCI relies on it being 2, but
+ * this would be useful for other transports too.
+ */
+
+#define VMADDR_CID_HOST 2
+
+/* Invalid vSockets version. */
+
+#define VM_SOCKETS_INVALID_VERSION -1U
+
+/* The epoch (first) component of the vSockets version. A single byte
+ * representing the epoch component of the vSockets version.
+ */
+
+#define VM_SOCKETS_VERSION_EPOCH(_v) (((_v) & 0xFF000000) >> 24)
+
+/* The major (second) component of the vSockets version. A single byte
+ * representing the major component of the vSockets version. Typically
+ * changes for every major release of a product.
+ */
+
+#define VM_SOCKETS_VERSION_MAJOR(_v) (((_v) & 0x00FF0000) >> 16)
+
+/* The minor (third) component of the vSockets version. Two bytes representing
+ * the minor component of the vSockets version.
+ */
+
+#define VM_SOCKETS_VERSION_MINOR(_v) (((_v) & 0x0000FFFF))
+
+/* Address structure for vSockets. The address family should be set to
+ * whatever vmci_sock_get_af_value_fd() returns. The structure members should
+ * all align on their natural boundaries without resorting to compiler packing
+ * directives. The total size of this structure should be exactly the same as
+ * that of struct sockaddr.
+ */
+
+struct sockaddr_vm {
+ sa_family_t svm_family;
+ unsigned short svm_reserved1;
+ unsigned int svm_port;
+ unsigned int svm_cid;
+ unsigned char svm_zero[sizeof(struct sockaddr) -
+ sizeof(sa_family_t) -
+ sizeof(unsigned short) -
+ sizeof(unsigned int) - sizeof(unsigned int)];
+};
+
+#define IOCTL_VM_SOCKETS_GET_LOCAL_CID _IO(7, 0xb9)
+
+#if defined(__KERNEL__)
+int vm_sockets_get_local_cid(void);
+#endif
+
+#endif
diff --git a/net/Kconfig b/net/Kconfig
index c31348e70aad..5a1888bb036d 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -217,6 +217,7 @@ source "net/dcb/Kconfig"
source "net/dns_resolver/Kconfig"
source "net/batman-adv/Kconfig"
source "net/openvswitch/Kconfig"
+source "net/vmw_vsock/Kconfig"
config RPS
boolean
diff --git a/net/Makefile b/net/Makefile
index c5aa8b3b49dc..091e7b04f301 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -69,3 +69,4 @@ obj-$(CONFIG_CEPH_LIB) += ceph/
obj-$(CONFIG_BATMAN_ADV) += batman-adv/
obj-$(CONFIG_NFC) += nfc/
obj-$(CONFIG_OPENVSWITCH) += openvswitch/
+obj-$(CONFIG_VSOCKETS) += vmw_vsock/
diff --git a/net/vmw_vsock/Kconfig b/net/vmw_vsock/Kconfig
new file mode 100644
index 000000000000..b5fa7e40cdcb
--- /dev/null
+++ b/net/vmw_vsock/Kconfig
@@ -0,0 +1,28 @@
+#
+# Vsock protocol
+#
+
+config VSOCKETS
+ tristate "Virtual Socket protocol"
+ help
+ Virtual Socket Protocol is a socket protocol similar to TCP/IP
+ allowing comunication between Virtual Machines and hypervisor
+ or host.
+
+ You should also select one or more hypervisor-specific transports
+ below.
+
+ To compile this driver as a module, choose M here: the module
+ will be called vsock. If unsure, say N.
+
+config VMWARE_VMCI_VSOCKETS
+ tristate "VMware VMCI transport for Virtual Sockets"
+ depends on VSOCKETS && VMWARE_VMCI
+ help
+ This module implements a VMCI transport for Virtual Sockets.
+
+ Enable this transport if your Virtual Machine runs on a VMware
+ hypervisor.
+
+ To compile this driver as a module, choose M here: the module
+ will be called vmw_vsock_vmci_transport. If unsure, say N.
diff --git a/net/vmw_vsock/Makefile b/net/vmw_vsock/Makefile
new file mode 100644
index 000000000000..2ce52d70f224
--- /dev/null
+++ b/net/vmw_vsock/Makefile
@@ -0,0 +1,7 @@
+obj-$(CONFIG_VSOCKETS) += vsock.o
+obj-$(CONFIG_VMWARE_VMCI_VSOCKETS) += vmw_vsock_vmci_transport.o
+
+vsock-y += af_vsock.o vsock_addr.o
+
+vmw_vsock_vmci_transport-y += vmci_transport.o vmci_transport_notify.o \
+ vmci_transport_notify_qstate.o
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
new file mode 100644
index 000000000000..54bb7bdf92d3
--- /dev/null
+++ b/net/vmw_vsock/af_vsock.c
@@ -0,0 +1,2015 @@
+/*
+ * VMware vSockets Driver
+ *
+ * Copyright (C) 2007-2013 VMware, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation version 2 and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+/* Implementation notes:
+ *
+ * - There are two kinds of sockets: those created by user action (such as
+ * calling socket(2)) and those created by incoming connection request packets.
+ *
+ * - There are two "global" tables, one for bound sockets (sockets that have
+ * specified an address that they are responsible for) and one for connected
+ * sockets (sockets that have established a connection with another socket).
+ * These tables are "global" in that all sockets on the system are placed
+ * within them. - Note, though, that the bound table contains an extra entry
+ * for a list of unbound sockets and SOCK_DGRAM sockets will always remain in
+ * that list. The bound table is used solely for lookup of sockets when packets
+ * are received and that's not necessary for SOCK_DGRAM sockets since we create
+ * a datagram handle for each and need not perform a lookup. Keeping SOCK_DGRAM
+ * sockets out of the bound hash buckets will reduce the chance of collisions
+ * when looking for SOCK_STREAM sockets and prevents us from having to check the
+ * socket type in the hash table lookups.
+ *
+ * - Sockets created by user action will either be "client" sockets that
+ * initiate a connection or "server" sockets that listen for connections; we do
+ * not support simultaneous connects (two "client" sockets connecting).
+ *
+ * - "Server" sockets are referred to as listener sockets throughout this
+ * implementation because they are in the SS_LISTEN state. When a connection
+ * request is received (the second kind of socket mentioned above), we create a
+ * new socket and refer to it as a pending socket. These pending sockets are
+ * placed on the pending connection list of the listener socket. When future
+ * packets are received for the address the listener socket is bound to, we
+ * check if the source of the packet is from one that has an existing pending
+ * connection. If it does, we process the packet for the pending socket. When
+ * that socket reaches the connected state, it is removed from the listener
+ * socket's pending list and enqueued in the listener socket's accept queue.
+ * Callers of accept(2) will accept connected sockets from the listener socket's
+ * accept queue. If the socket cannot be accepted for some reason then it is
+ * marked rejected. Once the connection is accepted, it is owned by the user
+ * process and the responsibility for cleanup falls with that user process.
+ *
+ * - It is possible that these pending sockets will never reach the connected
+ * state; in fact, we may never receive another packet after the connection
+ * request. Because of this, we must schedule a cleanup function to run in the
+ * future, after some amount of time passes where a connection should have been
+ * established. This function ensures that the socket is off all lists so it
+ * cannot be retrieved, then drops all references to the socket so it is cleaned
+ * up (sock_put() -> sk_free() -> our sk_destruct implementation). Note this
+ * function will also cleanup rejected sockets, those that reach the connected
+ * state but leave it before they have been accepted.
+ *
+ * - Sockets created by user action will be cleaned up when the user process
+ * calls close(2), causing our release implementation to be called. Our release
+ * implementation will perform some cleanup then drop the last reference so our
+ * sk_destruct implementation is invoked. Our sk_destruct implementation will
+ * perform additional cleanup that's common for both types of sockets.
+ *
+ * - A socket's reference count is what ensures that the structure won't be
+ * freed. Each entry in a list (such as the "global" bound and connected tables
+ * and the listener socket's pending list and connected queue) ensures a
+ * reference. When we defer work until process context and pass a socket as our
+ * argument, we must ensure the reference count is increased to ensure the
+ * socket isn't freed before the function is run; the deferred function will
+ * then drop the reference.
+ */
+
+#include <linux/types.h>
+
+#define EXPORT_SYMTAB
+#include <linux/bitops.h>
+#include <linux/cred.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/kmod.h>
+#include <linux/list.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/net.h>
+#include <linux/poll.h>
+#include <linux/skbuff.h>
+#include <linux/smp.h>
+#include <linux/socket.h>
+#include <linux/stddef.h>
+#include <linux/unistd.h>
+#include <linux/wait.h>
+#include <linux/workqueue.h>
+#include <net/sock.h>
+
+#include "af_vsock.h"
+#include "vsock_version.h"
+
+static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr);
+static void vsock_sk_destruct(struct sock *sk);
+static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
+
+/* Protocol family. */
+static struct proto vsock_proto = {
+ .name = "AF_VSOCK",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct vsock_sock),
+};
+
+/* The default peer timeout indicates how long we will wait for a peer response
+ * to a control message.
+ */
+#define VSOCK_DEFAULT_CONNECT_TIMEOUT (2 * HZ)
+
+#define SS_LISTEN 255
+
+static const struct vsock_transport *transport;
+static DEFINE_MUTEX(vsock_register_mutex);
+
+/**** EXPORTS ****/
+
+/* Get the ID of the local context. This is transport dependent. */
+
+int vm_sockets_get_local_cid(void)
+{
+ return transport->get_local_cid();
+}
+EXPORT_SYMBOL_GPL(vm_sockets_get_local_cid);
+
+/**** UTILS ****/
+
+/* Each bound VSocket is stored in the bind hash table and each connected
+ * VSocket is stored in the connected hash table.
+ *
+ * Unbound sockets are all put on the same list attached to the end of the hash
+ * table (vsock_unbound_sockets). Bound sockets are added to the hash table in
+ * the bucket that their local address hashes to (vsock_bound_sockets(addr)
+ * represents the list that addr hashes to).
+ *
+ * Specifically, we initialize the vsock_bind_table array to a size of
+ * VSOCK_HASH_SIZE + 1 so that vsock_bind_table[0] through
+ * vsock_bind_table[VSOCK_HASH_SIZE - 1] are for bound sockets and
+ * vsock_bind_table[VSOCK_HASH_SIZE] is for unbound sockets. The hash function
+ * mods with VSOCK_HASH_SIZE - 1 to ensure this.
+ */
+#define VSOCK_HASH_SIZE 251
+#define MAX_PORT_RETRIES 24
+
+#define VSOCK_HASH(addr) ((addr)->svm_port % (VSOCK_HASH_SIZE - 1))
+#define vsock_bound_sockets(addr) (&vsock_bind_table[VSOCK_HASH(addr)])
+#define vsock_unbound_sockets (&vsock_bind_table[VSOCK_HASH_SIZE])
+
+/* XXX This can probably be implemented in a better way. */
+#define VSOCK_CONN_HASH(src, dst) \
+ (((src)->svm_cid ^ (dst)->svm_port) % (VSOCK_HASH_SIZE - 1))
+#define vsock_connected_sockets(src, dst) \
+ (&vsock_connected_table[VSOCK_CONN_HASH(src, dst)])
+#define vsock_connected_sockets_vsk(vsk) \
+ vsock_connected_sockets(&(vsk)->remote_addr, &(vsk)->local_addr)
+
+static struct list_head vsock_bind_table[VSOCK_HASH_SIZE + 1];
+static struct list_head vsock_connected_table[VSOCK_HASH_SIZE];
+static DEFINE_SPINLOCK(vsock_table_lock);
+
+static __init void vsock_init_tables(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(vsock_bind_table); i++)
+ INIT_LIST_HEAD(&vsock_bind_table[i]);
+
+ for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++)
+ INIT_LIST_HEAD(&vsock_connected_table[i]);
+}
+
+static void __vsock_insert_bound(struct list_head *list,
+ struct vsock_sock *vsk)
+{
+ sock_hold(&vsk->sk);
+ list_add(&vsk->bound_table, list);
+}
+
+static void __vsock_insert_connected(struct list_head *list,
+ struct vsock_sock *vsk)
+{
+ sock_hold(&vsk->sk);
+ list_add(&vsk->connected_table, list);
+}
+
+static void __vsock_remove_bound(struct vsock_sock *vsk)
+{
+ list_del_init(&vsk->bound_table);
+ sock_put(&vsk->sk);
+}
+
+static void __vsock_remove_connected(struct vsock_sock *vsk)
+{
+ list_del_init(&vsk->connected_table);
+ sock_put(&vsk->sk);
+}
+
+static struct sock *__vsock_find_bound_socket(struct sockaddr_vm *addr)
+{
+ struct vsock_sock *vsk;
+
+ list_for_each_entry(vsk, vsock_bound_sockets(addr), bound_table)
+ if (vsock_addr_equals_addr_any(addr, &vsk->local_addr))
+ return sk_vsock(vsk);
+
+ return NULL;
+}
+
+static struct sock *__vsock_find_connected_socket(struct sockaddr_vm *src,
+ struct sockaddr_vm *dst)
+{
+ struct vsock_sock *vsk;
+
+ list_for_each_entry(vsk, vsock_connected_sockets(src, dst),
+ connected_table) {
+ if (vsock_addr_equals_addr(src, &vsk->remote_addr)
+ && vsock_addr_equals_addr(dst, &vsk->local_addr)) {
+ return sk_vsock(vsk);
+ }
+ }
+
+ return NULL;
+}
+
+static bool __vsock_in_bound_table(struct vsock_sock *vsk)
+{
+ return !list_empty(&vsk->bound_table);
+}
+
+static bool __vsock_in_connected_table(struct vsock_sock *vsk)
+{
+ return !list_empty(&vsk->connected_table);
+}
+
+static void vsock_insert_unbound(struct vsock_sock *vsk)
+{
+ spin_lock_bh(&vsock_table_lock);
+ __vsock_insert_bound(vsock_unbound_sockets, vsk);
+ spin_unlock_bh(&vsock_table_lock);
+}
+
+void vsock_insert_connected(struct vsock_sock *vsk)
+{
+ struct list_head *list = vsock_connected_sockets(
+ &vsk->remote_addr, &vsk->local_addr);
+
+ spin_lock_bh(&vsock_table_lock);
+ __vsock_insert_connected(list, vsk);
+ spin_unlock_bh(&vsock_table_lock);
+}
+EXPORT_SYMBOL_GPL(vsock_insert_connected);
+
+void vsock_remove_bound(struct vsock_sock *vsk)
+{
+ spin_lock_bh(&vsock_table_lock);
+ __vsock_remove_bound(vsk);
+ spin_unlock_bh(&vsock_table_lock);
+}
+EXPORT_SYMBOL_GPL(vsock_remove_bound);
+
+void vsock_remove_connected(struct vsock_sock *vsk)
+{
+ spin_lock_bh(&vsock_table_lock);
+ __vsock_remove_connected(vsk);
+ spin_unlock_bh(&vsock_table_lock);
+}
+EXPORT_SYMBOL_GPL(vsock_remove_connected);
+
+struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr)
+{
+ struct sock *sk;
+
+ spin_lock_bh(&vsock_table_lock);
+ sk = __vsock_find_bound_socket(addr);
+ if (sk)
+ sock_hold(sk);
+
+ spin_unlock_bh(&vsock_table_lock);
+
+ return sk;
+}
+EXPORT_SYMBOL_GPL(vsock_find_bound_socket);
+
+struct sock *vsock_find_connected_socket(struct sockaddr_vm *src,
+ struct sockaddr_vm *dst)
+{
+ struct sock *sk;
+
+ spin_lock_bh(&vsock_table_lock);
+ sk = __vsock_find_connected_socket(src, dst);
+ if (sk)
+ sock_hold(sk);
+
+ spin_unlock_bh(&vsock_table_lock);
+
+ return sk;
+}
+EXPORT_SYMBOL_GPL(vsock_find_connected_socket);
+
+static bool vsock_in_bound_table(struct vsock_sock *vsk)
+{
+ bool ret;
+
+ spin_lock_bh(&vsock_table_lock);
+ ret = __vsock_in_bound_table(vsk);
+ spin_unlock_bh(&vsock_table_lock);
+
+ return ret;
+}
+
+static bool vsock_in_connected_table(struct vsock_sock *vsk)
+{
+ bool ret;
+
+ spin_lock_bh(&vsock_table_lock);
+ ret = __vsock_in_connected_table(vsk);
+ spin_unlock_bh(&vsock_table_lock);
+
+ return ret;
+}
+
+void vsock_for_each_connected_socket(void (*fn)(struct sock *sk))
+{
+ int i;
+
+ spin_lock_bh(&vsock_table_lock);
+
+ for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++) {
+ struct vsock_sock *vsk;
+ list_for_each_entry(vsk, &vsock_connected_table[i],
+ connected_table);
+ fn(sk_vsock(vsk));
+ }
+
+ spin_unlock_bh(&vsock_table_lock);
+}
+EXPORT_SYMBOL_GPL(vsock_for_each_connected_socket);
+
+void vsock_add_pending(struct sock *listener, struct sock *pending)
+{
+ struct vsock_sock *vlistener;
+ struct vsock_sock *vpending;
+
+ vlistener = vsock_sk(listener);
+ vpending = vsock_sk(pending);
+
+ sock_hold(pending);
+ sock_hold(listener);
+ list_add_tail(&vpending->pending_links, &vlistener->pending_links);
+}
+EXPORT_SYMBOL_GPL(vsock_add_pending);
+
+void vsock_remove_pending(struct sock *listener, struct sock *pending)
+{
+ struct vsock_sock *vpending = vsock_sk(pending);
+
+ list_del_init(&vpending->pending_links);
+ sock_put(listener);
+ sock_put(pending);
+}
+EXPORT_SYMBOL_GPL(vsock_remove_pending);
+
+void vsock_enqueue_accept(struct sock *listener, struct sock *connected)
+{
+ struct vsock_sock *vlistener;
+ struct vsock_sock *vconnected;
+
+ vlistener = vsock_sk(listener);
+ vconnected = vsock_sk(connected);
+
+ sock_hold(connected);
+ sock_hold(listener);
+ list_add_tail(&vconnected->accept_queue, &vlistener->accept_queue);
+}
+EXPORT_SYMBOL_GPL(vsock_enqueue_accept);
+
+static struct sock *vsock_dequeue_accept(struct sock *listener)
+{
+ struct vsock_sock *vlistener;
+ struct vsock_sock *vconnected;
+
+ vlistener = vsock_sk(listener);
+
+ if (list_empty(&vlistener->accept_queue))
+ return NULL;
+
+ vconnected = list_entry(vlistener->accept_queue.next,
+ struct vsock_sock, accept_queue);
+
+ list_del_init(&vconnected->accept_queue);
+ sock_put(listener);
+ /* The caller will need a reference on the connected socket so we let
+ * it call sock_put().
+ */
+
+ return sk_vsock(vconnected);
+}
+
+static bool vsock_is_accept_queue_empty(struct sock *sk)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+ return list_empty(&vsk->accept_queue);
+}
+
+static bool vsock_is_pending(struct sock *sk)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+ return !list_empty(&vsk->pending_links);
+}
+
+static int vsock_send_shutdown(struct sock *sk, int mode)
+{
+ return transport->shutdown(vsock_sk(sk), mode);
+}
+
+void vsock_pending_work(struct work_struct *work)
+{
+ struct sock *sk;
+ struct sock *listener;
+ struct vsock_sock *vsk;
+ bool cleanup;
+
+ vsk = container_of(work, struct vsock_sock, dwork.work);
+ sk = sk_vsock(vsk);
+ listener = vsk->listener;
+ cleanup = true;
+
+ lock_sock(listener);
+ lock_sock(sk);
+
+ if (vsock_is_pending(sk)) {
+ vsock_remove_pending(listener, sk);
+ } else if (!vsk->rejected) {
+ /* We are not on the pending list and accept() did not reject
+ * us, so we must have been accepted by our user process. We
+ * just need to drop our references to the sockets and be on
+ * our way.
+ */
+ cleanup = false;
+ goto out;
+ }
+
+ listener->sk_ack_backlog--;
+
+ /* We need to remove ourself from the global connected sockets list so
+ * incoming packets can't find this socket, and to reduce the reference
+ * count.
+ */
+ if (vsock_in_connected_table(vsk))
+ vsock_remove_connected(vsk);
+
+ sk->sk_state = SS_FREE;
+
+out:
+ release_sock(sk);
+ release_sock(listener);
+ if (cleanup)
+ sock_put(sk);
+
+ sock_put(sk);
+ sock_put(listener);
+}
+EXPORT_SYMBOL_GPL(vsock_pending_work);
+
+/**** SOCKET OPERATIONS ****/
+
+static int __vsock_bind_stream(struct vsock_sock *vsk,
+ struct sockaddr_vm *addr)
+{
+ static u32 port = LAST_RESERVED_PORT + 1;
+ struct sockaddr_vm new_addr;
+
+ vsock_addr_init(&new_addr, addr->svm_cid, addr->svm_port);
+
+ if (addr->svm_port == VMADDR_PORT_ANY) {
+ bool found = false;
+ unsigned int i;
+
+ for (i = 0; i < MAX_PORT_RETRIES; i++) {
+ if (port <= LAST_RESERVED_PORT)
+ port = LAST_RESERVED_PORT + 1;
+
+ new_addr.svm_port = port++;
+
+ if (!__vsock_find_bound_socket(&new_addr)) {
+ found = true;
+ break;
+ }
+ }
+
+ if (!found)
+ return -EADDRNOTAVAIL;
+ } else {
+ /* If port is in reserved range, ensure caller
+ * has necessary privileges.
+ */
+ if (addr->svm_port <= LAST_RESERVED_PORT &&
+ !capable(CAP_NET_BIND_SERVICE)) {
+ return -EACCES;
+ }
+
+ if (__vsock_find_bound_socket(&new_addr))
+ return -EADDRINUSE;
+ }
+
+ vsock_addr_init(&vsk->local_addr, new_addr.svm_cid, new_addr.svm_port);
+
+ /* Remove stream sockets from the unbound list and add them to the hash
+ * table for easy lookup by its address. The unbound list is simply an
+ * extra entry at the end of the hash table, a trick used by AF_UNIX.
+ */
+ __vsock_remove_bound(vsk);
+ __vsock_insert_bound(vsock_bound_sockets(&vsk->local_addr), vsk);
+
+ return 0;
+}
+
+static int __vsock_bind_dgram(struct vsock_sock *vsk,
+ struct sockaddr_vm *addr)
+{
+ return transport->dgram_bind(vsk, addr);
+}
+
+static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+ u32 cid;
+ int retval;
+
+ /* First ensure this socket isn't already bound. */
+ if (vsock_addr_bound(&vsk->local_addr))
+ return -EINVAL;
+
+ /* Now bind to the provided address or select appropriate values if
+ * none are provided (VMADDR_CID_ANY and VMADDR_PORT_ANY). Note that
+ * like AF_INET prevents binding to a non-local IP address (in most
+ * cases), we only allow binding to the local CID.
+ */
+ cid = transport->get_local_cid();
+ if (addr->svm_cid != cid && addr->svm_cid != VMADDR_CID_ANY)
+ return -EADDRNOTAVAIL;
+
+ switch (sk->sk_socket->type) {
+ case SOCK_STREAM:
+ spin_lock_bh(&vsock_table_lock);
+ retval = __vsock_bind_stream(vsk, addr);
+ spin_unlock_bh(&vsock_table_lock);
+ break;
+
+ case SOCK_DGRAM:
+ retval = __vsock_bind_dgram(vsk, addr);
+ break;
+
+ default:
+ retval = -EINVAL;
+ break;
+ }
+
+ return retval;
+}
+
+struct sock *__vsock_create(struct net *net,
+ struct socket *sock,
+ struct sock *parent,
+ gfp_t priority,
+ unsigned short type)
+{
+ struct sock *sk;
+ struct vsock_sock *psk;
+ struct vsock_sock *vsk;
+
+ sk = sk_alloc(net, AF_VSOCK, priority, &vsock_proto);
+ if (!sk)
+ return NULL;
+
+ sock_init_data(sock, sk);
+
+ /* sk->sk_type is normally set in sock_init_data, but only if sock is
+ * non-NULL. We make sure that our sockets always have a type by
+ * setting it here if needed.
+ */
+ if (!sock)
+ sk->sk_type = type;
+
+ vsk = vsock_sk(sk);
+ vsock_addr_init(&vsk->local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
+ vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
+
+ sk->sk_destruct = vsock_sk_destruct;
+ sk->sk_backlog_rcv = vsock_queue_rcv_skb;
+ sk->sk_state = 0;
+ sock_reset_flag(sk, SOCK_DONE);
+
+ INIT_LIST_HEAD(&vsk->bound_table);
+ INIT_LIST_HEAD(&vsk->connected_table);
+ vsk->listener = NULL;
+ INIT_LIST_HEAD(&vsk->pending_links);
+ INIT_LIST_HEAD(&vsk->accept_queue);
+ vsk->rejected = false;
+ vsk->sent_request = false;
+ vsk->ignore_connecting_rst = false;
+ vsk->peer_shutdown = 0;
+
+ psk = parent ? vsock_sk(parent) : NULL;
+ if (parent) {
+ vsk->trusted = psk->trusted;
+ vsk->owner = get_cred(psk->owner);
+ vsk->connect_timeout = psk->connect_timeout;
+ } else {
+ vsk->trusted = capable(CAP_NET_ADMIN);
+ vsk->owner = get_current_cred();
+ vsk->connect_timeout = VSOCK_DEFAULT_CONNECT_TIMEOUT;
+ }
+
+ if (transport->init(vsk, psk) < 0) {
+ sk_free(sk);
+ return NULL;
+ }
+
+ if (sock)
+ vsock_insert_unbound(vsk);
+
+ return sk;
+}
+EXPORT_SYMBOL_GPL(__vsock_create);
+
+static void __vsock_release(struct sock *sk)
+{
+ if (sk) {
+ struct sk_buff *skb;
+ struct sock *pending;
+ struct vsock_sock *vsk;
+
+ vsk = vsock_sk(sk);
+ pending = NULL; /* Compiler warning. */
+
+ if (vsock_in_bound_table(vsk))
+ vsock_remove_bound(vsk);
+
+ if (vsock_in_connected_table(vsk))
+ vsock_remove_connected(vsk);
+
+ transport->release(vsk);
+
+ lock_sock(sk);
+ sock_orphan(sk);
+ sk->sk_shutdown = SHUTDOWN_MASK;
+
+ while ((skb = skb_dequeue(&sk->sk_receive_queue)))
+ kfree_skb(skb);
+
+ /* Clean up any sockets that never were accepted. */
+ while ((pending = vsock_dequeue_accept(sk)) != NULL) {
+ __vsock_release(pending);
+ sock_put(pending);
+ }
+
+ release_sock(sk);
+ sock_put(sk);
+ }
+}
+
+static void vsock_sk_destruct(struct sock *sk)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+
+ transport->destruct(vsk);
+
+ /* When clearing these addresses, there's no need to set the family and
+ * possibly register the address family with the kernel.
+ */
+ vsock_addr_init(&vsk->local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
+ vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
+
+ put_cred(vsk->owner);
+}
+
+static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ int err;
+
+ err = sock_queue_rcv_skb(sk, skb);
+ if (err)
+ kfree_skb(skb);
+
+ return err;
+}
+
+s64 vsock_stream_has_data(struct vsock_sock *vsk)
+{
+ return transport->stream_has_data(vsk);
+}
+EXPORT_SYMBOL_GPL(vsock_stream_has_data);
+
+s64 vsock_stream_has_space(struct vsock_sock *vsk)
+{
+ return transport->stream_has_space(vsk);
+}
+EXPORT_SYMBOL_GPL(vsock_stream_has_space);
+
+static int vsock_release(struct socket *sock)
+{
+ __vsock_release(sock->sk);
+ sock->sk = NULL;