summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- net/rds/af_rds.c 138
-rw-r--r-- net/rds/bind.c 91
-rw-r--r-- net/rds/cong.c 23
-rw-r--r-- net/rds/connection.c 132
-rw-r--r-- net/rds/ib.c 17
-rw-r--r-- net/rds/ib.h 51
-rw-r--r-- net/rds/ib_cm.c 299
-rw-r--r-- net/rds/ib_rdma.c 15
-rw-r--r-- net/rds/ib_recv.c 18
-rw-r--r-- net/rds/ib_send.c 10
-rw-r--r-- net/rds/loop.c 7
-rw-r--r-- net/rds/rdma.c 6
-rw-r--r-- net/rds/rdma_transport.c 56
-rw-r--r-- net/rds/rds.h 70
-rw-r--r-- net/rds/recv.c 51
-rw-r--r-- net/rds/send.c 67
-rw-r--r-- net/rds/tcp.c 32
-rw-r--r-- net/rds/tcp_connect.c 34
-rw-r--r-- net/rds/tcp_listen.c 18
-rw-r--r-- net/rds/tcp_recv.c 9
-rw-r--r-- net/rds/tcp_send.c 4
-rw-r--r-- net/rds/threads.c 69
-rw-r--r-- net/rds/transport.c 15
23 files changed, 863 insertions, 369 deletions
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index ab751a150f70..fc1a5c63b783 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -35,6 +35,7 @@
#include <linux/kernel.h>
#include <linux/gfp.h>
#include <linux/in.h>
+#include <linux/ipv6.h>
#include <linux/poll.h>
#include <net/sock.h>
@@ -113,26 +114,63 @@ void rds_wake_sk_sleep(struct rds_sock *rs)
static int rds_getname(struct socket *sock, struct sockaddr *uaddr,
int peer)
{
- struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
struct rds_sock *rs = rds_sk_to_rs(sock->sk);
-
- memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+ struct sockaddr_in6 *sin6;
+ struct sockaddr_in *sin;
+ int uaddr_len;
/* racey, don't care */
if (peer) {
- if (!rs->rs_conn_addr)
+ if (ipv6_addr_any(&rs->rs_conn_addr))
return -ENOTCONN;
- sin->sin_port = rs->rs_conn_port;
- sin->sin_addr.s_addr = rs->rs_conn_addr;
+ if (ipv6_addr_v4mapped(&rs->rs_conn_addr)) {
+ sin = (struct sockaddr_in *)uaddr;
+ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+ sin->sin_family = AF_INET;
+ sin->sin_port = rs->rs_conn_port;
+ sin->sin_addr.s_addr = rs->rs_conn_addr_v4;
+ uaddr_len = sizeof(*sin);
+ } else {
+ sin6 = (struct sockaddr_in6 *)uaddr;
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_port = rs->rs_conn_port;
+ sin6->sin6_addr = rs->rs_conn_addr;
+ sin6->sin6_flowinfo = 0;
+ /* scope_id is the same as in the bound address. */
+ sin6->sin6_scope_id = rs->rs_bound_scope_id;
+ uaddr_len = sizeof(*sin6);
+ }
} else {
- sin->sin_port = rs->rs_bound_port;
- sin->sin_addr.s_addr = rs->rs_bound_addr;
+ /* If socket is not yet bound, set the return address family
+ * to be AF_UNSPEC (value 0) and the address size to be that
+ * of an IPv4 address.
+ */
+ if (ipv6_addr_any(&rs->rs_bound_addr)) {
+ sin = (struct sockaddr_in *)uaddr;
+ memset(sin, 0, sizeof(*sin));
+ sin->sin_family = AF_UNSPEC;
+ return sizeof(*sin);
+ }
+ if (ipv6_addr_v4mapped(&rs->rs_bound_addr)) {
+ sin = (struct sockaddr_in *)uaddr;
+ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+ sin->sin_family = AF_INET;
+ sin->sin_port = rs->rs_bound_port;
+ sin->sin_addr.s_addr = rs->rs_bound_addr_v4;
+ uaddr_len = sizeof(*sin);
+ } else {
+ sin6 = (struct sockaddr_in6 *)uaddr;
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_port = rs->rs_bound_port;
+ sin6->sin6_addr = rs->rs_bound_addr;
+ sin6->sin6_flowinfo = 0;
+ sin6->sin6_scope_id = rs->rs_bound_scope_id;
+ uaddr_len = sizeof(*sin6);
+ }
}
- sin->sin_family = AF_INET;
-
- return sizeof(*sin);
+ return uaddr_len;
}
/*
@@ -203,11 +241,12 @@ static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval,
int len)
{
+ struct sockaddr_in6 sin6;
struct sockaddr_in sin;
int ret = 0;
/* racing with another thread binding seems ok here */
- if (rs->rs_bound_addr == 0) {
+ if (ipv6_addr_any(&rs->rs_bound_addr)) {
ret = -ENOTCONN; /* XXX not a great errno */
goto out;
}
@@ -215,14 +254,23 @@ static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval,
if (len < sizeof(struct sockaddr_in)) {
ret = -EINVAL;
goto out;
+ } else if (len < sizeof(struct sockaddr_in6)) {
+ /* Assume IPv4 */
+ if (copy_from_user(&sin, optval, sizeof(struct sockaddr_in))) {
+ ret = -EFAULT;
+ goto out;
+ }
+ ipv6_addr_set_v4mapped(sin.sin_addr.s_addr, &sin6.sin6_addr);
+ sin6.sin6_port = sin.sin_port;
+ } else {
+ if (copy_from_user(&sin6, optval,
+ sizeof(struct sockaddr_in6))) {
+ ret = -EFAULT;
+ goto out;
+ }
}
- if (copy_from_user(&sin, optval, sizeof(sin))) {
- ret = -EFAULT;
- goto out;
- }
-
- rds_send_drop_to(rs, &sin);
+ rds_send_drop_to(rs, &sin6);
out:
return ret;
}
@@ -435,31 +483,41 @@ static int rds_connect(struct socket *sock, struct sockaddr *uaddr,
int addr_len, int flags)
{
struct sock *sk = sock->sk;
- struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+ struct sockaddr_in *sin;
struct rds_sock *rs = rds_sk_to_rs(sk);
int ret = 0;
lock_sock(sk);
- if (addr_len != sizeof(struct sockaddr_in)) {
- ret = -EINVAL;
- goto out;
- }
+ switch (addr_len) {
+ case sizeof(struct sockaddr_in):
+ sin = (struct sockaddr_in *)uaddr;
+ if (sin->sin_family != AF_INET) {
+ ret = -EAFNOSUPPORT;
+ break;
+ }
+ if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
+ ret = -EDESTADDRREQ;
+ break;
+ }
+ if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) ||
+ sin->sin_addr.s_addr == htonl(INADDR_BROADCAST)) {
+ ret = -EINVAL;
+ break;
+ }
+ ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &rs->rs_conn_addr);
+ rs->rs_conn_port = sin->sin_port;
+ break;
- if (sin->sin_family != AF_INET) {
- ret = -EAFNOSUPPORT;
- goto out;
- }
+ case sizeof(struct sockaddr_in6):
+ ret = -EPROTONOSUPPORT;
+ break;
- if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
- ret = -EDESTADDRREQ;
- goto out;
+ default:
+ ret = -EINVAL;
+ break;
}
- rs->rs_conn_addr = sin->sin_addr.s_addr;
- rs->rs_conn_port = sin->sin_port;
-
-out:
release_sock(sk);
return ret;
}
@@ -578,8 +636,10 @@ static void rds_sock_inc_info(struct socket *sock, unsigned int len,
list_for_each_entry(inc, &rs->rs_recv_queue, i_item) {
total++;
if (total <= len)
- rds_inc_info_copy(inc, iter, inc->i_saddr,
- rs->rs_bound_addr, 1);
+ rds_inc_info_copy(inc, iter,
+ inc->i_saddr.s6_addr32[3],
+ rs->rs_bound_addr_v4,
+ 1);
}
read_unlock(&rs->rs_recv_lock);
@@ -608,8 +668,8 @@ static void rds_sock_info(struct socket *sock, unsigned int len,
list_for_each_entry(rs, &rds_sock_list, rs_item) {
sinfo.sndbuf = rds_sk_sndbuf(rs);
sinfo.rcvbuf = rds_sk_rcvbuf(rs);
- sinfo.bound_addr = rs->rs_bound_addr;
- sinfo.connected_addr = rs->rs_conn_addr;
+ sinfo.bound_addr = rs->rs_bound_addr_v4;
+ sinfo.connected_addr = rs->rs_conn_addr_v4;
sinfo.bound_port = rs->rs_bound_port;
sinfo.connected_port = rs->rs_conn_port;
sinfo.inum = sock_i_ino(rds_rs_to_sk(rs));
diff --git a/net/rds/bind.c b/net/rds/bind.c
index 5aa3a64aa4f0..c401776ad938 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -33,6 +33,7 @@
#include <linux/kernel.h>
#include <net/sock.h>
#include <linux/in.h>
+#include <linux/ipv6.h>
#include <linux/if_arp.h>
#include <linux/jhash.h>
#include <linux/ratelimit.h>
@@ -42,42 +43,58 @@ static struct rhashtable bind_hash_table;
static const struct rhashtable_params ht_parms = {
.nelem_hint = 768,
- .key_len = sizeof(u64),
+ .key_len = RDS_BOUND_KEY_LEN,
.key_offset = offsetof(struct rds_sock, rs_bound_key),
.head_offset = offsetof(struct rds_sock, rs_bound_node),
.max_size = 16384,
.min_size = 1024,
};
+/* Create a key for the bind hash table manipulation. Port is in network byte
+ * order.
+ */
+static inline void __rds_create_bind_key(u8 *key, const struct in6_addr *addr,
+ __be16 port, __u32 scope_id)
+{
+ memcpy(key, addr, sizeof(*addr));
+ key += sizeof(*addr);
+ memcpy(key, &port, sizeof(port));
+ key += sizeof(port);
+ memcpy(key, &scope_id, sizeof(scope_id));
+}
+
/*
* Return the rds_sock bound at the given local address.
*
* The rx path can race with rds_release. We notice if rds_release() has
* marked this socket and don't return a rs ref to the rx path.
*/
-struct rds_sock *rds_find_bound(__be32 addr, __be16 port)
+struct rds_sock *rds_find_bound(const struct in6_addr *addr, __be16 port,
+ __u32 scope_id)
{
- u64 key = ((u64)addr << 32) | port;
+ u8 key[RDS_BOUND_KEY_LEN];
struct rds_sock *rs;
- rs = rhashtable_lookup_fast(&bind_hash_table, &key, ht_parms);
+ __rds_create_bind_key(key, addr, port, scope_id);
+ rs = rhashtable_lookup_fast(&bind_hash_table, key, ht_parms);
if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
rds_sock_addref(rs);
else
rs = NULL;
- rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr,
- ntohs(port));
+ rdsdebug("returning rs %p for %pI6c:%u\n", rs, addr,
+ ntohs(port));
return rs;
}
/* returns -ve errno or +ve port */
-static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
+static int rds_add_bound(struct rds_sock *rs, const struct in6_addr *addr,
+ __be16 *port, __u32 scope_id)
{
int ret = -EADDRINUSE;
u16 rover, last;
- u64 key;
+ u8 key[RDS_BOUND_KEY_LEN];
if (*port != 0) {
rover = be16_to_cpu(*port);
@@ -95,12 +112,13 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
if (rover == RDS_FLAG_PROBE_PORT)
continue;
- key = ((u64)addr << 32) | cpu_to_be16(rover);
- if (rhashtable_lookup_fast(&bind_hash_table, &key, ht_parms))
+ __rds_create_bind_key(key, addr, cpu_to_be16(rover),
+ scope_id);
+ if (rhashtable_lookup_fast(&bind_hash_table, key, ht_parms))
continue;
- rs->rs_bound_key = key;
- rs->rs_bound_addr = addr;
+ memcpy(rs->rs_bound_key, key, sizeof(rs->rs_bound_key));
+ rs->rs_bound_addr = *addr;
net_get_random_once(&rs->rs_hash_initval,
sizeof(rs->rs_hash_initval));
rs->rs_bound_port = cpu_to_be16(rover);
@@ -114,7 +132,7 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
rs, &addr, (int)ntohs(*port));
break;
} else {
- rs->rs_bound_addr = 0;
+ rs->rs_bound_addr = in6addr_any;
rds_sock_put(rs);
ret = -ENOMEM;
break;
@@ -127,44 +145,61 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
void rds_remove_bound(struct rds_sock *rs)
{
- if (!rs->rs_bound_addr)
+ if (ipv6_addr_any(&rs->rs_bound_addr))
return;
- rdsdebug("rs %p unbinding from %pI4:%d\n",
+ rdsdebug("rs %p unbinding from %pI6c:%d\n",
rs, &rs->rs_bound_addr,
ntohs(rs->rs_bound_port));
rhashtable_remove_fast(&bind_hash_table, &rs->rs_bound_node, ht_parms);
rds_sock_put(rs);
- rs->rs_bound_addr = 0;
+ rs->rs_bound_addr = in6addr_any;
}
int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
struct sock *sk = sock->sk;
- struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
struct rds_sock *rs = rds_sk_to_rs(sk);
+ struct in6_addr v6addr, *binding_addr;
struct rds_transport *trans;
+ __u32 scope_id = 0;
int ret = 0;
+ __be16 port;
+ /* We only allow an RDS socket to be bound to an IPv4 address. IPv6
+ * address support will be added later.
+ */
+ if (addr_len == sizeof(struct sockaddr_in)) {
+ struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+
+ if (sin->sin_family != AF_INET ||
+ sin->sin_addr.s_addr == htonl(INADDR_ANY))
+ return -EINVAL;
+ ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &v6addr);
+ binding_addr = &v6addr;
+ port = sin->sin_port;
+ } else if (addr_len == sizeof(struct sockaddr_in6)) {
+ return -EPROTONOSUPPORT;
+ } else {
+ return -EINVAL;
+ }
lock_sock(sk);
- if (addr_len != sizeof(struct sockaddr_in) ||
- sin->sin_family != AF_INET ||
- rs->rs_bound_addr ||
- sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
+ /* RDS socket does not allow re-binding. */
+ if (!ipv6_addr_any(&rs->rs_bound_addr)) {
ret = -EINVAL;
goto out;
}
- ret = rds_add_bound(rs, sin->sin_addr.s_addr, &sin->sin_port);
+ ret = rds_add_bound(rs, binding_addr, &port, scope_id);
if (ret)
goto out;
if (rs->rs_transport) { /* previously bound */
trans = rs->rs_transport;
if (trans->laddr_check(sock_net(sock->sk),
- sin->sin_addr.s_addr) != 0) {
+ binding_addr, scope_id) != 0) {
ret = -ENOPROTOOPT;
rds_remove_bound(rs);
} else {
@@ -172,13 +207,13 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
}
goto out;
}
- trans = rds_trans_get_preferred(sock_net(sock->sk),
- sin->sin_addr.s_addr);
+ trans = rds_trans_get_preferred(sock_net(sock->sk), binding_addr,
+ scope_id);
if (!trans) {
ret = -EADDRNOTAVAIL;
rds_remove_bound(rs);
- pr_info_ratelimited("RDS: %s could not find a transport for %pI4, load rds_tcp or rds_rdma?\n",
- __func__, &sin->sin_addr.s_addr);
+ pr_info_ratelimited("RDS: %s could not find a transport for %pI6c, load rds_tcp or rds_rdma?\n",
+ __func__, binding_addr);
goto out;
}
diff --git a/net/rds/cong.c b/net/rds/cong.c
index 63da9d2f142d..ccdff09a79c8 100644
--- a/net/rds/cong.c
+++ b/net/rds/cong.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007 Oracle. All rights reserved.
+ * Copyright (c) 2007, 2017 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -101,7 +101,7 @@ static DEFINE_RWLOCK(rds_cong_monitor_lock);
static DEFINE_SPINLOCK(rds_cong_lock);
static struct rb_root rds_cong_tree = RB_ROOT;
-static struct rds_cong_map *rds_cong_tree_walk(__be32 addr,
+static struct rds_cong_map *rds_cong_tree_walk(const struct in6_addr *addr,
struct rds_cong_map *insert)
{
struct rb_node **p = &rds_cong_tree.rb_node;
@@ -109,12 +109,15 @@ static struct rds_cong_map *rds_cong_tree_walk(__be32 addr,
struct rds_cong_map *map;
while (*p) {
+ int diff;
+
parent = *p;
map = rb_entry(parent, struct rds_cong_map, m_rb_node);
- if (addr < map->m_addr)
+ diff = rds_addr_cmp(addr, &map->m_addr);
+ if (diff < 0)
p = &(*p)->rb_left;
- else if (addr > map->m_addr)
+ else if (diff > 0)
p = &(*p)->rb_right;
else
return map;
@@ -132,7 +135,7 @@ static struct rds_cong_map *rds_cong_tree_walk(__be32 addr,
* these bitmaps in the process getting pointers to them. The bitmaps are only
* ever freed as the module is removed after all connections have been freed.
*/
-static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
+static struct rds_cong_map *rds_cong_from_addr(const struct in6_addr *addr)
{
struct rds_cong_map *map;
struct rds_cong_map *ret = NULL;
@@ -144,7 +147,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
if (!map)
return NULL;
- map->m_addr = addr;
+ map->m_addr = *addr;
init_waitqueue_head(&map->m_waitq);
INIT_LIST_HEAD(&map->m_conn_list);
@@ -171,7 +174,7 @@ out:
kfree(map);
}
- rdsdebug("map %p for addr %x\n", ret, be32_to_cpu(addr));
+ rdsdebug("map %p for addr %pI6c\n", ret, addr);
return ret;
}
@@ -202,8 +205,8 @@ void rds_cong_remove_conn(struct rds_connection *conn)
int rds_cong_get_maps(struct rds_connection *conn)
{
- conn->c_lcong = rds_cong_from_addr(conn->c_laddr);
- conn->c_fcong = rds_cong_from_addr(conn->c_faddr);
+ conn->c_lcong = rds_cong_from_addr(&conn->c_laddr);
+ conn->c_fcong = rds_cong_from_addr(&conn->c_faddr);
if (!(conn->c_lcong && conn->c_fcong))
return -ENOMEM;
@@ -353,7 +356,7 @@ void rds_cong_remove_socket(struct rds_sock *rs)
/* update congestion map for now-closed port */
spin_lock_irqsave(&rds_cong_lock, flags);
- map = rds_cong_tree_walk(rs->rs_bound_addr, NULL);
+ map = rds_cong_tree_walk(&rs->rs_bound_addr, NULL);
spin_unlock_irqrestore(&rds_cong_lock, flags);
if (map && rds_cong_test_bit(map, rs->rs_bound_port)) {
diff --git a/net/rds/connection.c b/net/rds/connection.c
index cfb05953b0e5..3176ead0ab4d 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -34,7 +34,8 @@
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/export.h>
-#include <net/inet_hashtables.h>
+#include <net/ipv6.h>
+#include <net/inet6_hashtables.h>
#include "rds.h"
#include "loop.h"
@@ -49,18 +50,21 @@ static unsigned long rds_conn_count;
static struct hlist_head rds_conn_hash[RDS_CONNECTION_HASH_ENTRIES];
static struct kmem_cache *rds_conn_slab;
-static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr)
+static struct hlist_head *rds_conn_bucket(const struct in6_addr *laddr,
+ const struct in6_addr *faddr)
{
+ static u32 rds6_hash_secret __read_mostly;
static u32 rds_hash_secret __read_mostly;
- unsigned long hash;
+ u32 lhash, fhash, hash;
net_get_random_once(&rds_hash_secret, sizeof(rds_hash_secret));
+ net_get_random_once(&rds6_hash_secret, sizeof(rds6_hash_secret));
+
+ lhash = (__force u32)laddr->s6_addr32[3];
+ fhash = __ipv6_addr_jhash(faddr, rds6_hash_secret);
+ hash = __inet6_ehashfn(lhash, 0, fhash, 0, rds_hash_secret);
- /* Pass NULL, don't need struct net for hash */
- hash = __inet_ehashfn(be32_to_cpu(laddr), 0,
- be32_to_cpu(faddr), 0,
- rds_hash_secret);
return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK];
}
@@ -72,20 +76,25 @@ static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr)
/* rcu read lock must be held or the connection spinlock */
static struct rds_connection *rds_conn_lookup(struct net *net,
struct hlist_head *head,
- __be32 laddr, __be32 faddr,
- struct rds_transport *trans)
+ const struct in6_addr *laddr,
+ const struct in6_addr *faddr,
+ struct rds_transport *trans,
+ int dev_if)
{
struct rds_connection *conn, *ret = NULL;
hlist_for_each_entry_rcu(conn, head, c_hash_node) {
- if (conn->c_faddr == faddr && conn->c_laddr == laddr &&
- conn->c_trans == trans && net == rds_conn_net(conn)) {
+ if (ipv6_addr_equal(&conn->c_faddr, faddr) &&
+ ipv6_addr_equal(&conn->c_laddr, laddr) &&
+ conn->c_trans == trans &&
+ net == rds_conn_net(conn) &&
+ conn->c_dev_if == dev_if) {
ret = conn;
break;
}
}
- rdsdebug("returning conn %p for %pI4 -> %pI4\n", ret,
- &laddr, &faddr);
+ rdsdebug("returning conn %p for %pI6c -> %pI6c\n", ret,
+ laddr, faddr);
return ret;
}
@@ -99,8 +108,8 @@ static void rds_conn_path_reset(struct rds_conn_path *cp)
{
struct rds_connection *conn = cp->cp_conn;
- rdsdebug("connection %pI4 to %pI4 reset\n",
- &conn->c_laddr, &conn->c_faddr);
+ rdsdebug("connection %pI6c to %pI6c reset\n",
+ &conn->c_laddr, &conn->c_faddr);
rds_stats_inc(s_conn_reset);
rds_send_path_reset(cp);
@@ -142,9 +151,12 @@ static void __rds_conn_path_init(struct rds_connection *conn,
* are torn down as the module is removed, if ever.
*/
static struct rds_connection *__rds_conn_create(struct net *net,
- __be32 laddr, __be32 faddr,
- struct rds_transport *trans, gfp_t gfp,
- int is_outgoing)
+ const struct in6_addr *laddr,
+ const struct in6_addr *faddr,
+ struct rds_transport *trans,
+ gfp_t gfp,
+ int is_outgoing,
+ int dev_if)
{
struct rds_connection *conn, *parent = NULL;
struct hlist_head *head = rds_conn_bucket(laddr, faddr);
@@ -154,9 +166,12 @@ static struct rds_connection *__rds_conn_create(struct net *net,
int npaths = (trans->t_mp_capable ? RDS_MPATH_WORKERS : 1);
rcu_read_lock();
- conn = rds_conn_lookup(net, head, laddr, faddr, trans);
- if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport &&
- laddr == faddr && !is_outgoing) {
+ conn = rds_conn_lookup(net, head, laddr, faddr, trans, dev_if);
+ if (conn &&
+ conn->c_loopback &&
+ conn->c_trans != &rds_loop_transport &&
+ ipv6_addr_equal(laddr, faddr) &&
+ !is_outgoing) {
/* This is a looped back IB connection, and we're
* called by the code handling the incoming connect.
* We need a second connection object into which we
@@ -181,8 +196,10 @@ static struct rds_connection *__rds_conn_create(struct net *net,
}
INIT_HLIST_NODE(&conn->c_hash_node);
- conn->c_laddr = laddr;
- conn->c_faddr = faddr;
+ conn->c_laddr = *laddr;
+ conn->c_isv6 = !ipv6_addr_v4mapped(laddr);
+ conn->c_faddr = *faddr;
+ conn->c_dev_if = dev_if;
rds_conn_net_set(conn, net);
@@ -199,7 +216,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
* can bind to the destination address then we'd rather the messages
* flow through loopback rather than either transport.
*/
- loop_trans = rds_trans_get_preferred(net, faddr);
+ loop_trans = rds_trans_get_preferred(net, faddr, conn->c_dev_if);
if (loop_trans) {
rds_trans_put(loop_trans);
conn->c_loopback = 1;
@@ -233,10 +250,10 @@ static struct rds_connection *__rds_conn_create(struct net *net,
goto out;
}
- rdsdebug("allocated conn %p for %pI4 -> %pI4 over %s %s\n",
- conn, &laddr, &faddr,
- strnlen(trans->t_name, sizeof(trans->t_name)) ? trans->t_name :
- "[unknown]", is_outgoing ? "(outgoing)" : "");
+ rdsdebug("allocated conn %p for %pI6c -> %pI6c over %s %s\n",
+ conn, laddr, faddr,
+ strnlen(trans->t_name, sizeof(trans->t_name)) ?
+ trans->t_name : "[unknown]", is_outgoing ? "(outgoing)" : "");
/*
* Since we ran without holding the conn lock, someone could
@@ -262,7 +279,8 @@ static struct rds_connection *__rds_conn_create(struct net *net,
/* Creating normal conn */
struct rds_connection *found;
- found = rds_conn_lookup(net, head, laddr, faddr, trans);
+ found = rds_conn_lookup(net, head, laddr, faddr, trans,
+ dev_if);
if (found) {
struct rds_conn_path *cp;
int i;
@@ -295,18 +313,22 @@ out:
}
struct rds_connection *rds_conn_create(struct net *net,
- __be32 laddr, __be32 faddr,
- struct rds_transport *trans, gfp_t gfp)
+ const struct in6_addr *laddr,
+ const struct in6_addr *faddr,
+ struct rds_transport *trans, gfp_t gfp,
+ int dev_if)
{
- return __rds_conn_create(net, laddr, faddr, trans, gfp, 0);
+ return __rds_conn_create(net, laddr, faddr, trans, gfp, 0, dev_if);
}
EXPORT_SYMBOL_GPL(rds_conn_create);
struct rds_connection *rds_conn_create_outgoing(struct net *net,
- __be32 laddr, __be32 faddr,
- struct rds_transport *trans, gfp_t gfp)
+ const struct in6_addr *laddr,
+ const struct in6_addr *faddr,
+ struct rds_transport *trans,
+ gfp_t gfp, int dev_if)
{
- return __rds_conn_create(net, laddr, faddr, trans, gfp, 1);
+ return __rds_conn_create(net, laddr, faddr, trans, gfp, 1, dev_if);
}
EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);
@@ -502,12 +524,17 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
/* XXX too lazy to maintain counts.. */
list_for_each_entry(rm, list, m_conn_item) {
+ __be32 laddr;
+ __be32 faddr;
+
total++;
+ laddr = conn->c_laddr.s6_addr32[3];
+ faddr = conn->c_faddr.s6_addr32[3];
if (total <= len)
rds_inc_info_copy(&rm->m_inc,
iter,
- conn->c_laddr,
- conn->c_faddr,
+ laddr,
+ faddr,
0);
}
@@ -584,7 +611,6 @@ static void rds_walk_conn_path_info(struct socket *sock, unsigned int len,
struct hlist_head *head;
struct rds_connection *conn;
size_t i;
- int j;
rcu_read_lock();
@@ -595,17 +621,20 @@ static void rds_walk_conn_path_info(struct socket *sock, unsigned int len,
i++, head++) {
hlist_for_each_entry_rcu(conn, head, c_hash_node) {
struct rds_conn_path *cp;
- int npaths;
- npaths = (conn->c_trans->t_mp_capable ?
- RDS_MPATH_WORKERS : 1);
- for (j = 0; j < npaths; j++) {
- cp = &conn->c_path[j];
+ /* XXX We only copy the information from the first
+ * path for now. The problem is that if there is
+ * more than one underlying path, we cannot report
+ * information about all of them using the existing
+ * API. For example, there is only one next_tx_seq,
+ * which path's next_tx_seq should we report? It is
+ * a bug in the design of MPRDS.
+ */
+ cp = conn->c_path;
- /* XXX no cp_lock usage.. */
- if (!visitor(cp, buffer))
- continue;
- }
+ /* XXX no cp_lock usage.. */
+ if (!visitor(cp, buffer))
+ continue;
/* We copy as much as we can fit in the buffer,
* but we count all items so that the caller
@@ -624,12 +653,13 @@ static void rds_walk_conn_path_info(struct socket *sock, unsigned int len,
static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer)
{
struct rds_info_connection *cinfo = buffer;
+ struct rds_connection *conn = cp->cp_conn;
cinfo->next_tx_seq = cp->cp_next_tx_seq;
cinfo->next_rx_seq = cp->cp_next_rx_seq;
- cinfo->laddr = cp->cp_conn->c_laddr;
- cinfo->faddr = cp->cp_conn->c_faddr;
- strncpy(cinfo->transport, cp->cp_conn->c_trans->t_name,
+ cinfo->laddr = conn->c_laddr.s6_addr32[3];
+ cinfo->faddr = conn->c_faddr.s6_addr32[3];
+ strncpy(cinfo->transport, conn->c_trans->t_name,
sizeof(cinfo->transport));
cinfo->flags = 0;
diff --git a/net/rds/ib.c b/net/rds/ib.c
index b6ad38e48f62..c712a848957d 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -296,8 +296,8 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
if (conn->c_trans != &rds_ib_transport)
return 0;
- iinfo->src_addr = conn->c_laddr;
- iinfo->dst_addr = conn->c_faddr;
+ iinfo->src_addr = conn->c_laddr.s6_addr32[3];
+ iinfo->dst_addr = conn->c_faddr.s6_addr32[3];
memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
@@ -341,7 +341,8 @@ static void rds_ib_ic_info(struct socket *sock, unsigned int len,
* allowed to influence which paths have priority. We could call userspace
* asserting this policy "routing".
*/
-static int rds_ib_laddr_check(struct net *net, __be32 addr)
+static int rds_ib_laddr_check(struct net *net, const struct in6_addr *addr,
+ __u32 scope_id)
{
int ret;
struct rdma_cm_id *cm_id;
@@ -357,7 +358,7 @@ static int rds_ib_laddr_check(struct net *net, __be32 addr)
memset(&sin, 0, sizeof(sin));
sin.sin_family = AF_INET;
- sin.sin_addr.s_addr = addr;
+ sin.sin_addr.s_addr = addr->s6_addr32[3];
/* rdma_bind_addr will only succeed for IB & iWARP devices */
ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
@@ -367,9 +368,9 @@ static int rds_ib_laddr_check(struct net *net, __be32 addr)
cm_id->device->node_type != RDMA_NODE_IB_CA)
ret = -EADDRNOTAVAIL;
- rdsdebug("addr %pI4 ret %d node type %d\n",
- &addr, ret,
- cm_id->device ? cm_id->device->node_type : -1);
+ rdsdebug("addr %pI6c ret %d node type %d\n",
+ addr, ret,
+ cm_id->device ? cm_id->device->node_type : -1);
rdma_destroy_id(cm_id);
diff --git a/net/rds/ib.h b/net/rds/ib.h
index a6f4d7d68e95..beb95b893f78 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -57,16 +57,44 @@ struct rds_ib_refill_cache {
struct list_head *ready;
};
+/* This is the common structure for the IB private data exchange in setting up
+ * an RDS connection. The exchange is different for IPv4 and IPv6 connections.
+ * The reason is that the address size is different and the addresses
+ * exchanged are at the beginning of the structure. Hence interoperability
+ * is not possible if the same structure is used.
+ */
+struct rds_ib_conn_priv_cmn {
+ u8 ricpc_protocol_major;
+ u8 ricpc_protocol_minor;
+ __be16 ricpc_protocol_minor_mask; /* bitmask */
+ __be32 ricpc_reserved1;
+ __be64 ricpc_ack_seq;
+ __be32 ricpc_credit; /* non-zero enables flow ctl */
+};
+
struct rds_ib_connect_private {
/* Add new fields at the end, and don't permute existing fields. */
- __be32 dp_saddr;
- __be32 dp_daddr;
- u8 dp_protocol_major;
- u8 dp_protocol_minor;
- __be16 dp_protocol_minor_mask; /* bitmask */
- __be32 dp_reserved1;
- __be64 dp_ack_seq;
- __be32 dp_credit; /* non-zero enables flow ctl */
+ __be32 dp_saddr;
+ __be32 dp_daddr;
+ struct rds_ib_conn_priv_cmn dp_cmn;
+};
+
+struct rds6_ib_connect_private {
+ /* Add new fields at the end, and don't permute existing fields. */
+ struct in6_addr dp_saddr;
+ struct in6_addr dp_daddr;
+ struct rds_ib_conn_priv_cmn dp_cmn;
+};
+
+#define dp_protocol_major dp_cmn.ricpc_protocol_major
+#define dp_protocol_minor dp_cmn.ricpc_protocol_minor
+#define dp_protocol_minor_mask dp_cmn.ricpc_protocol_minor_mask
+#define dp_ack_seq dp_cmn.ricpc_ack_seq
+#define dp_credit dp_cmn.ricpc_credit
+
+union rds_ib_conn_priv {
+ struct rds_ib_connect_private ricp_v4;
+ struct rds6_ib_connect_private ricp_v6;
};
struct rds_ib_send_work {
@@ -351,8 +379,8 @@ void rds_ib_listen_stop(void);
__printf(2, 3)
void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...);
int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
- struct rdma_cm_event *event);
-int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id);
+ struct rdma_cm_event *event, bool isv6);
+int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6);
void rds_ib_cm_connect_complete(struct rds_connection *conn,
struct rdma_cm_event *event);
@@ -361,7 +389,8 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn,
__rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt)
/* ib_rdma.c */
-int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr);
+int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev,
+ struct in6_addr *ipaddr);
void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
voi