summaryrefslogtreecommitdiffstats
path: root/net/rds
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2016-03-19 10:05:34 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2016-03-19 10:05:34 -0700
commit1200b6809dfd9d73bc4c7db76d288c35fa4b2ebe (patch)
tree552e03de245cdbd0780ca1215914edc4a26540f7 /net/rds
parent6b5f04b6cf8ebab9a65d9c0026c650bb2538fd0f (diff)
parentfe30937b65354c7fec244caebbdaae68e28ca797 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller: "Highlights: 1) Support more Realtek wireless chips, from Jes Sorenson. 2) New BPF types for per-cpu hash and arrap maps, from Alexei Starovoitov. 3) Make several TCP sysctls per-namespace, from Nikolay Borisov. 4) Allow the use of SO_REUSEPORT in order to do per-thread processing of incoming TCP/UDP connections. The muxing can be done using a BPF program which hashes the incoming packet. From Craig Gallek. 5) Add a multiplexer for TCP streams, to provide a messaged based interface. BPF programs can be used to determine the message boundaries. From Tom Herbert. 6) Add 802.1AE MACSEC support, from Sabrina Dubroca. 7) Avoid factorial complexity when taking down an inetdev interface with lots of configured addresses. We were doing things like traversing the entire address less for each address removed, and flushing the entire netfilter conntrack table for every address as well. 8) Add and use SKB bulk free infrastructure, from Jesper Brouer. 9) Allow offloading u32 classifiers to hardware, and implement for ixgbe, from John Fastabend. 10) Allow configuring IRQ coalescing parameters on a per-queue basis, from Kan Liang. 11) Extend ethtool so that larger link mode masks can be supported. From David Decotigny. 12) Introduce devlink, which can be used to configure port link types (ethernet vs Infiniband, etc.), port splitting, and switch device level attributes as a whole. From Jiri Pirko. 13) Hardware offload support for flower classifiers, from Amir Vadai. 14) Add "Local Checksum Offload". Basically, for a tunneled packet the checksum of the outer header is 'constant' (because with the checksum field filled into the inner protocol header, the payload of the outer frame checksums to 'zero'), and we can take advantage of that in various ways. From Edward Cree" * git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1548 commits) bonding: fix bond_get_stats() net: bcmgenet: fix dma api length mismatch net/mlx4_core: Fix backward compatibility on VFs phy: mdio-thunder: Fix some Kconfig typos lan78xx: add ndo_get_stats64 lan78xx: handle statistics counter rollover RDS: TCP: Remove unused constant RDS: TCP: Add sysctl tunables for sndbuf/rcvbuf on rds-tcp socket net: smc911x: convert pxa dma to dmaengine team: remove duplicate set of flag IFF_MULTICAST bonding: remove duplicate set of flag IFF_MULTICAST net: fix a comment typo ethernet: micrel: fix some error codes ip_tunnels, bpf: define IP_TUNNEL_OPTS_MAX and use it bpf, dst: add and use dst_tclassid helper bpf: make skb->tc_classid also readable net: mvneta: bm: clarify dependencies cls_bpf: reset class and reuse major in da ldmvsw: Checkpatch sunvnet.c and sunvnet_common.c ldmvsw: Add ldmvsw.c driver code ...
Diffstat (limited to 'net/rds')
-rw-r--r--net/rds/Kconfig7
-rw-r--r--net/rds/Makefile4
-rw-r--r--net/rds/af_rds.c26
-rw-r--r--net/rds/ib.c47
-rw-r--r--net/rds/ib.h37
-rw-r--r--net/rds/ib_cm.c59
-rw-r--r--net/rds/ib_fmr.c248
-rw-r--r--net/rds/ib_frmr.c376
-rw-r--r--net/rds/ib_mr.h148
-rw-r--r--net/rds/ib_rdma.c495
-rw-r--r--net/rds/ib_send.c6
-rw-r--r--net/rds/ib_stats.c2
-rw-r--r--net/rds/iw.c312
-rw-r--r--net/rds/iw.h398
-rw-r--r--net/rds/iw_cm.c769
-rw-r--r--net/rds/iw_rdma.c837
-rw-r--r--net/rds/iw_recv.c904
-rw-r--r--net/rds/iw_ring.c169
-rw-r--r--net/rds/iw_send.c981
-rw-r--r--net/rds/iw_stats.c95
-rw-r--r--net/rds/iw_sysctl.c123
-rw-r--r--net/rds/rdma_transport.c21
-rw-r--r--net/rds/rdma_transport.h5
-rw-r--r--net/rds/rds.h1
-rw-r--r--net/rds/recv.c20
-rw-r--r--net/rds/tcp.c146
26 files changed, 1193 insertions, 5043 deletions
diff --git a/net/rds/Kconfig b/net/rds/Kconfig
index f2c670ba7b9b..bffde4b46c5d 100644
--- a/net/rds/Kconfig
+++ b/net/rds/Kconfig
@@ -4,14 +4,13 @@ config RDS
depends on INET
---help---
The RDS (Reliable Datagram Sockets) protocol provides reliable,
- sequenced delivery of datagrams over Infiniband, iWARP,
- or TCP.
+ sequenced delivery of datagrams over Infiniband or TCP.
config RDS_RDMA
- tristate "RDS over Infiniband and iWARP"
+ tristate "RDS over Infiniband"
depends on RDS && INFINIBAND && INFINIBAND_ADDR_TRANS
---help---
- Allow RDS to use Infiniband and iWARP as a transport.
+ Allow RDS to use Infiniband as a transport.
This transport supports RDMA operations.
config RDS_TCP
diff --git a/net/rds/Makefile b/net/rds/Makefile
index 56d3f6023ced..0e72bec1529f 100644
--- a/net/rds/Makefile
+++ b/net/rds/Makefile
@@ -6,9 +6,7 @@ rds-y := af_rds.o bind.o cong.o connection.o info.o message.o \
obj-$(CONFIG_RDS_RDMA) += rds_rdma.o
rds_rdma-y := rdma_transport.o \
ib.o ib_cm.o ib_recv.o ib_ring.o ib_send.o ib_stats.o \
- ib_sysctl.o ib_rdma.o \
- iw.o iw_cm.o iw_recv.o iw_ring.o iw_send.o iw_stats.o \
- iw_sysctl.o iw_rdma.o
+ ib_sysctl.o ib_rdma.o ib_fmr.o ib_frmr.o
obj-$(CONFIG_RDS_TCP) += rds_tcp.o
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index b5476aebd68d..6beaeb1138f3 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -277,6 +277,27 @@ static int rds_set_transport(struct rds_sock *rs, char __user *optval,
return rs->rs_transport ? 0 : -ENOPROTOOPT;
}
+static int rds_enable_recvtstamp(struct sock *sk, char __user *optval,
+ int optlen)
+{
+ int val, valbool;
+
+ if (optlen != sizeof(int))
+ return -EFAULT;
+
+ if (get_user(val, (int __user *)optval))
+ return -EFAULT;
+
+ valbool = val ? 1 : 0;
+
+ if (valbool)
+ sock_set_flag(sk, SOCK_RCVTSTAMP);
+ else
+ sock_reset_flag(sk, SOCK_RCVTSTAMP);
+
+ return 0;
+}
+
static int rds_setsockopt(struct socket *sock, int level, int optname,
char __user *optval, unsigned int optlen)
{
@@ -312,6 +333,11 @@ static int rds_setsockopt(struct socket *sock, int level, int optname,
ret = rds_set_transport(rs, optval, optlen);
release_sock(sock->sk);
break;
+ case SO_TIMESTAMP:
+ lock_sock(sock->sk);
+ ret = rds_enable_recvtstamp(sock->sk, optval, optlen);
+ release_sock(sock->sk);
+ break;
default:
ret = -ENOPROTOOPT;
}
diff --git a/net/rds/ib.c b/net/rds/ib.c
index 9481d55ff6cb..b5342fddaf98 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -42,15 +42,16 @@
#include "rds.h"
#include "ib.h"
+#include "ib_mr.h"
-unsigned int rds_ib_fmr_1m_pool_size = RDS_FMR_1M_POOL_SIZE;
-unsigned int rds_ib_fmr_8k_pool_size = RDS_FMR_8K_POOL_SIZE;
+unsigned int rds_ib_mr_1m_pool_size = RDS_MR_1M_POOL_SIZE;
+unsigned int rds_ib_mr_8k_pool_size = RDS_MR_8K_POOL_SIZE;
unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT;
-module_param(rds_ib_fmr_1m_pool_size, int, 0444);
-MODULE_PARM_DESC(rds_ib_fmr_1m_pool_size, " Max number of 1M fmr per HCA");
-module_param(rds_ib_fmr_8k_pool_size, int, 0444);
-MODULE_PARM_DESC(rds_ib_fmr_8k_pool_size, " Max number of 8K fmr per HCA");
+module_param(rds_ib_mr_1m_pool_size, int, 0444);
+MODULE_PARM_DESC(rds_ib_mr_1m_pool_size, " Max number of 1M mr per HCA");
+module_param(rds_ib_mr_8k_pool_size, int, 0444);
+MODULE_PARM_DESC(rds_ib_mr_8k_pool_size, " Max number of 8K mr per HCA");
module_param(rds_ib_retry_count, int, 0444);
MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error");
@@ -139,14 +140,20 @@ static void rds_ib_add_one(struct ib_device *device)
rds_ibdev->max_wrs = device->attrs.max_qp_wr;
rds_ibdev->max_sge = min(device->attrs.max_sge, RDS_IB_MAX_SGE);
+ rds_ibdev->has_fr = (device->attrs.device_cap_flags &
+ IB_DEVICE_MEM_MGT_EXTENSIONS);
+ rds_ibdev->has_fmr = (device->alloc_fmr && device->dealloc_fmr &&
+ device->map_phys_fmr && device->unmap_fmr);
+ rds_ibdev->use_fastreg = (rds_ibdev->has_fr && !rds_ibdev->has_fmr);
+
rds_ibdev->fmr_max_remaps = device->attrs.max_map_per_fmr?: 32;
- rds_ibdev->max_1m_fmrs = device->attrs.max_mr ?
+ rds_ibdev->max_1m_mrs = device->attrs.max_mr ?
min_t(unsigned int, (device->attrs.max_mr / 2),
- rds_ib_fmr_1m_pool_size) : rds_ib_fmr_1m_pool_size;
+ rds_ib_mr_1m_pool_size) : rds_ib_mr_1m_pool_size;
- rds_ibdev->max_8k_fmrs = device->attrs.max_mr ?
+ rds_ibdev->max_8k_mrs = device->attrs.max_mr ?
min_t(unsigned int, ((device->attrs.max_mr / 2) * RDS_MR_8K_SCALE),
- rds_ib_fmr_8k_pool_size) : rds_ib_fmr_8k_pool_size;
+ rds_ib_mr_8k_pool_size) : rds_ib_mr_8k_pool_size;
rds_ibdev->max_initiator_depth = device->attrs.max_qp_init_rd_atom;
rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom;
@@ -172,10 +179,14 @@ static void rds_ib_add_one(struct ib_device *device)
goto put_dev;
}
- rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, fmr_max_remaps = %d, max_1m_fmrs = %d, max_8k_fmrs = %d\n",
+ rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, fmr_max_remaps = %d, max_1m_mrs = %d, max_8k_mrs = %d\n",
device->attrs.max_fmr, rds_ibdev->max_wrs, rds_ibdev->max_sge,
- rds_ibdev->fmr_max_remaps, rds_ibdev->max_1m_fmrs,
- rds_ibdev->max_8k_fmrs);
+ rds_ibdev->fmr_max_remaps, rds_ibdev->max_1m_mrs,
+ rds_ibdev->max_8k_mrs);
+
+ pr_info("RDS/IB: %s: %s supported and preferred\n",
+ device->name,
+ rds_ibdev->use_fastreg ? "FRMR" : "FMR");
INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
INIT_LIST_HEAD(&rds_ibdev->conn_list);
@@ -364,7 +375,7 @@ void rds_ib_exit(void)
rds_ib_sysctl_exit();
rds_ib_recv_exit();
rds_trans_unregister(&rds_ib_transport);
- rds_ib_fmr_exit();
+ rds_ib_mr_exit();
}
struct rds_transport rds_ib_transport = {
@@ -400,13 +411,13 @@ int rds_ib_init(void)
INIT_LIST_HEAD(&rds_ib_devices);
- ret = rds_ib_fmr_init();
+ ret = rds_ib_mr_init();
if (ret)
goto out;
ret = ib_register_client(&rds_ib_client);
if (ret)
- goto out_fmr_exit;
+ goto out_mr_exit;
ret = rds_ib_sysctl_init();
if (ret)
@@ -430,8 +441,8 @@ out_sysctl:
rds_ib_sysctl_exit();
out_ibreg:
rds_ib_unregister_client();
-out_fmr_exit:
- rds_ib_fmr_exit();
+out_mr_exit:
+ rds_ib_mr_exit();
out:
return ret;
}
diff --git a/net/rds/ib.h b/net/rds/ib.h
index b3fdebb57460..627fb79aee65 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -9,17 +9,12 @@
#include "rds.h"
#include "rdma_transport.h"
-#define RDS_FMR_1M_POOL_SIZE (8192 / 2)
-#define RDS_FMR_1M_MSG_SIZE 256
-#define RDS_FMR_8K_MSG_SIZE 2
-#define RDS_MR_8K_SCALE (256 / (RDS_FMR_8K_MSG_SIZE + 1))
-#define RDS_FMR_8K_POOL_SIZE (RDS_MR_8K_SCALE * (8192 / 2))
-
#define RDS_IB_MAX_SGE 8
#define RDS_IB_RECV_SGE 2
#define RDS_IB_DEFAULT_RECV_WR 1024
#define RDS_IB_DEFAULT_SEND_WR 256
+#define RDS_IB_DEFAULT_FR_WR 512
#define RDS_IB_DEFAULT_RETRY_COUNT 2
@@ -28,7 +23,6 @@
#define RDS_IB_RECYCLE_BATCH_COUNT 32
#define RDS_IB_WC_MAX 32
-#define RDS_IB_SEND_OP BIT_ULL(63)
extern struct rw_semaphore rds_ib_devices_lock;
extern struct list_head rds_ib_devices;
@@ -129,6 +123,9 @@ struct rds_ib_connection {
struct ib_wc i_send_wc[RDS_IB_WC_MAX];
struct ib_wc i_recv_wc[RDS_IB_WC_MAX];
+ /* To control the number of wrs from fastreg */
+ atomic_t i_fastreg_wrs;
+
/* interrupt handling */
struct tasklet_struct i_send_tasklet;
struct tasklet_struct i_recv_tasklet;
@@ -207,12 +204,16 @@ struct rds_ib_device {
struct list_head conn_list;
struct ib_device *dev;
struct ib_pd *pd;
- unsigned int max_fmrs;
+ bool has_fmr;
+ bool has_fr;
+ bool use_fastreg;
+
+ unsigned int max_mrs;
struct rds_ib_mr_pool *mr_1m_pool;
struct rds_ib_mr_pool *mr_8k_pool;
unsigned int fmr_max_remaps;
- unsigned int max_8k_fmrs;
- unsigned int max_1m_fmrs;
+ unsigned int max_8k_mrs;
+ unsigned int max_1m_mrs;
int max_sge;
unsigned int max_wrs;
unsigned int max_initiator_depth;
@@ -266,6 +267,8 @@ struct rds_ib_statistics {
uint64_t s_ib_rdma_mr_1m_pool_flush;
uint64_t s_ib_rdma_mr_1m_pool_wait;
uint64_t s_ib_rdma_mr_1m_pool_depleted;
+ uint64_t s_ib_rdma_mr_8k_reused;
+ uint64_t s_ib_rdma_mr_1m_reused;
uint64_t s_ib_atomic_cswp;
uint64_t s_ib_atomic_fadd;
};
@@ -317,8 +320,6 @@ struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device);
void rds_ib_dev_put(struct rds_ib_device *rds_ibdev);
extern struct ib_client rds_ib_client;
-extern unsigned int rds_ib_fmr_1m_pool_size;
-extern unsigned int rds_ib_fmr_8k_pool_size;
extern unsigned int rds_ib_retry_count;
extern spinlock_t ib_nodev_conns_lock;
@@ -348,17 +349,7 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr);
void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
void rds_ib_destroy_nodev_conns(void);
-struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev,
- int npages);
-void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo);
-void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
-void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
- struct rds_sock *rs, u32 *key_ret);
-void rds_ib_sync_mr(void *trans_private, int dir);
-void rds_ib_free_mr(void *trans_private, int invalidate);
-void rds_ib_flush_mrs(void);
-int rds_ib_fmr_init(void);
-void rds_ib_fmr_exit(void);
+void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc);
/* ib_recv.c */
int rds_ib_recv_init(void);
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index da5a7fb98c77..8764970f0c24 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -236,12 +236,10 @@ static void rds_ib_cq_comp_handler_recv(struct ib_cq *cq, void *context)
tasklet_schedule(&ic->i_recv_tasklet);
}
-static void poll_cq(struct rds_ib_connection *ic, struct ib_cq *cq,
- struct ib_wc *wcs,
- struct rds_ib_ack_state *ack_state)
+static void poll_scq(struct rds_ib_connection *ic, struct ib_cq *cq,
+ struct ib_wc *wcs)
{
- int nr;
- int i;
+ int nr, i;
struct ib_wc *wc;
while ((nr = ib_poll_cq(cq, RDS_IB_WC_MAX, wcs)) > 0) {
@@ -251,10 +249,12 @@ static void poll_cq(struct rds_ib_connection *ic, struct ib_cq *cq,
(unsigned long long)wc->wr_id, wc->status,
wc->byte_len, be32_to_cpu(wc->ex.imm_data));
- if (wc->wr_id & RDS_IB_SEND_OP)
+ if (wc->wr_id <= ic->i_send_ring.w_nr ||
+ wc->wr_id == RDS_IB_ACK_WR_ID)
rds_ib_send_cqe_handler(ic, wc);
else
- rds_ib_recv_cqe_handler(ic, wc, ack_state);
+ rds_ib_mr_cqe_handler(ic, wc);
+
}
}
}
@@ -263,14 +263,12 @@ static void rds_ib_tasklet_fn_send(unsigned long data)
{
struct rds_ib_connection *ic = (struct rds_ib_connection *)data;
struct rds_connection *conn = ic->conn;
- struct rds_ib_ack_state state;
rds_ib_stats_inc(s_ib_tasklet_call);
- memset(&state, 0, sizeof(state));
- poll_cq(ic, ic->i_send_cq, ic->i_send_wc, &state);
+ poll_scq(ic, ic->i_send_cq, ic->i_send_wc);
ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
- poll_cq(ic, ic->i_send_cq, ic->i_send_wc, &state);
+ poll_scq(ic, ic->i_send_cq, ic->i_send_wc);
if (rds_conn_up(conn) &&
(!test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
@@ -278,6 +276,25 @@ static void rds_ib_tasklet_fn_send(unsigned long data)
rds_send_xmit(ic->conn);
}
+static void poll_rcq(struct rds_ib_connection *ic, struct ib_cq *cq,
+ struct ib_wc *wcs,
+ struct rds_ib_ack_state *ack_state)
+{
+ int nr, i;
+ struct ib_wc *wc;
+
+ while ((nr = ib_poll_cq(cq, RDS_IB_WC_MAX, wcs)) > 0) {
+ for (i = 0; i < nr; i++) {
+ wc = wcs + i;
+ rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
+ (unsigned long long)wc->wr_id, wc->status,
+ wc->byte_len, be32_to_cpu(wc->ex.imm_data));
+
+ rds_ib_recv_cqe_handler(ic, wc, ack_state);
+ }
+ }
+}
+
static void rds_ib_tasklet_fn_recv(unsigned long data)
{
struct rds_ib_connection *ic = (struct rds_ib_connection *)data;
@@ -291,9 +308,9 @@ static void rds_ib_tasklet_fn_recv(unsigned long data)
rds_ib_stats_inc(s_ib_tasklet_call);
memset(&state, 0, sizeof(state));
- poll_cq(ic, ic->i_recv_cq, ic->i_recv_wc, &state);
+ poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state);
ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
- poll_cq(ic, ic->i_recv_cq, ic->i_recv_wc, &state);
+ poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state);
if (state.ack_next_valid)
rds_ib_set_ack(ic, state.ack_next, state.ack_required);
@@ -351,7 +368,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
struct ib_qp_init_attr attr;
struct ib_cq_init_attr cq_attr = {};
struct rds_ib_device *rds_ibdev;
- int ret;
+ int ret, fr_queue_space;
/*
* It's normal to see a null device if an incoming connection races
@@ -361,6 +378,12 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
if (!rds_ibdev)
return -EOPNOTSUPP;
+ /* The fr_queue_space is currently set to 512, to add extra space on
+ * completion queue and send queue. This extra space is used for FRMR
+ * registration and invalidation work requests
+ */
+ fr_queue_space = (rds_ibdev->use_fastreg ? RDS_IB_DEFAULT_FR_WR : 0);
+
/* add the conn now so that connection establishment has the dev */
rds_ib_add_conn(rds_ibdev, conn);
@@ -372,7 +395,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
/* Protection domain and memory range */
ic->i_pd = rds_ibdev->pd;
- cq_attr.cqe = ic->i_send_ring.w_nr + 1;
+ cq_attr.cqe = ic->i_send_ring.w_nr + fr_queue_space + 1;
ic->i_send_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_send,
rds_ib_cq_event_handler, conn,
@@ -412,7 +435,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
attr.event_handler = rds_ib_qp_event_handler;
attr.qp_context = conn;
/* + 1 to allow for the single ack message */
- attr.cap.max_send_wr = ic->i_send_ring.w_nr + 1;
+ attr.cap.max_send_wr = ic->i_send_ring.w_nr + fr_queue_space + 1;
attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1;
attr.cap.max_send_sge = rds_ibdev->max_sge;
attr.cap.max_recv_sge = RDS_IB_RECV_SGE;
@@ -420,6 +443,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
attr.qp_type = IB_QPT_RC;
attr.send_cq = ic->i_send_cq;
attr.recv_cq = ic->i_recv_cq;
+ atomic_set(&ic->i_fastreg_wrs, RDS_IB_DEFAULT_FR_WR);
/*
* XXX this can fail if max_*_wr is too large? Are we supposed
@@ -739,7 +763,8 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
*/
wait_event(rds_ib_ring_empty_wait,
rds_ib_ring_empty(&ic->i_recv_ring) &&
- (atomic_read(&ic->i_signaled_sends) == 0));
+ (atomic_read(&ic->i_signaled_sends) == 0) &&
+ (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR));
tasklet_kill(&ic->i_send_tasklet);
tasklet_kill(&ic->i_recv_tasklet);
diff --git a/net/rds/ib_fmr.c b/net/rds/ib_fmr.c
new file mode 100644
index 000000000000..4fe8f4fec4ee
--- /dev/null
+++ b/net/rds/ib_fmr.c
@@ -0,0 +1,248 @@
+/*
+ * Copyright (c) 2016 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ib_mr.h"
+
+struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev, int npages)
+{
+ struct rds_ib_mr_pool *pool;
+ struct rds_ib_mr *ibmr = NULL;
+ struct rds_ib_fmr *fmr;
+ int err = 0;
+
+ if (npages <= RDS_MR_8K_MSG_SIZE)
+ pool = rds_ibdev->mr_8k_pool;
+ else
+ pool = rds_ibdev->mr_1m_pool;
+
+ ibmr = rds_ib_try_reuse_ibmr(pool);
+ if (ibmr)
+ return ibmr;
+
+ ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL,
+ rdsibdev_to_node(rds_ibdev));
+ if (!ibmr) {
+ err = -ENOMEM;
+ goto out_no_cigar;
+ }
+
+ fmr = &ibmr->u.fmr;
+ fmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
+ (IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_REMOTE_ATOMIC),
+ &pool->fmr_attr);
+ if (IS_ERR(fmr->fmr)) {
+ err = PTR_ERR(fmr->fmr);
+ fmr->fmr = NULL;
+ pr_warn("RDS/IB: %s failed (err=%d)\n", __func__, err);
+ goto out_no_cigar;
+ }
+
+ ibmr->pool = pool;
+ if (pool->pool_type == RDS_IB_MR_8K_POOL)
+ rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc);
+ else
+ rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc);
+
+ return ibmr;
+
+out_no_cigar:
+ if (ibmr) {
+ if (fmr->fmr)
+ ib_dealloc_fmr(fmr->fmr);
+ kfree(ibmr);
+ }
+ atomic_dec(&pool->item_count);
+ return ERR_PTR(err);
+}
+
+int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibmr,
+ struct scatterlist *sg, unsigned int nents)
+{
+ struct ib_device *dev = rds_ibdev->dev;
+ struct rds_ib_fmr *fmr = &ibmr->u.fmr;
+ struct scatterlist *scat = sg;
+ u64 io_addr = 0;
+ u64 *dma_pages;
+ u32 len;
+ int page_cnt, sg_dma_len;
+ int i, j;
+ int ret;
+
+ sg_dma_len = ib_dma_map_sg(dev, sg, nents, DMA_BIDIRECTIONAL);
+ if (unlikely(!sg_dma_len)) {
+ pr_warn("RDS/IB: %s failed!\n", __func__);
+ return -EBUSY;
+ }
+
+ len = 0;
+ page_cnt = 0;
+
+ for (i = 0; i < sg_dma_len; ++i) {
+ unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
+ u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
+
+ if (dma_addr & ~PAGE_MASK) {
+ if (i > 0)
+ return -EINVAL;
+ else
+ ++page_cnt;
+ }
+ if ((dma_addr + dma_len) & ~PAGE_MASK) {
+ if (i < sg_dma_len - 1)
+ return -EINVAL;
+ else
+ ++page_cnt;
+ }
+
+ len += dma_len;
+ }
+
+ page_cnt += len >> PAGE_SHIFT;
+ if (page_cnt > ibmr->pool->fmr_attr.max_pages)
+ return -EINVAL;
+
+ dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC,
+ rdsibdev_to_node(rds_ibdev));
+ if (!dma_pages)
+ return -ENOMEM;
+
+ page_cnt = 0;
+ for (i = 0; i < sg_dma_len; ++i) {
+ unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
+ u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
+
+ for (j = 0; j < dma_len; j += PAGE_SIZE)
+ dma_pages[page_cnt++] =
+ (dma_addr & PAGE_MASK) + j;
+ }
+
+ ret = ib_map_phys_fmr(fmr->fmr, dma_pages, page_cnt, io_addr);
+ if (ret)
+ goto out;
+
+ /* Success - we successfully remapped the MR, so we can
+ * safely tear down the old mapping.
+ */
+ rds_ib_teardown_mr(ibmr);
+
+ ibmr->sg = scat;
+ ibmr->sg_len = nents;
+ ibmr->sg_dma_len = sg_dma_len;
+ ibmr->remap_count++;
+
+ if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL)
+ rds_ib_stats_inc(s_ib_rdma_mr_8k_used);
+ else
+ rds_ib_stats_inc(s_ib_rdma_mr_1m_used);
+ ret = 0;
+
+out:
+ kfree(dma_pages);
+
+ return ret;
+}
+
+struct rds_ib_mr *rds_ib_reg_fmr(struct rds_ib_device *rds_ibdev,
+ struct scatterlist *sg,
+ unsigned long nents,
+ u32 *key)
+{
+ struct rds_ib_mr *ibmr = NULL;
+ struct rds_ib_fmr *fmr;
+ int ret;
+
+ ibmr = rds_ib_alloc_fmr(rds_ibdev, nents);
+ if (IS_ERR(ibmr))
+ return ibmr;
+
+ ibmr->device = rds_ibdev;
+ fmr = &ibmr->u.fmr;
+ ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents);
+ if (ret == 0)
+ *key = fmr->fmr->rkey;
+ else
+ rds_ib_free_mr(ibmr, 0);
+
+ return ibmr;
+}
+
+void rds_ib_unreg_fmr(struct list_head *list, unsigned int *nfreed,
+ unsigned long *unpinned, unsigned int goal)
+{
+ struct rds_ib_mr *ibmr, *next;
+ struct rds_ib_fmr *fmr;
+ LIST_HEAD(fmr_list);
+ int ret = 0;
+ unsigned int freed = *nfreed;
+
+ /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
+ list_for_each_entry(ibmr, list, unmap_list) {
+ fmr = &ibmr->u.fmr;
+ list_add(&fmr->fmr->list, &fmr_list);
+ }
+
+ ret = ib_unmap_fmr(&fmr_list);
+ if (ret)
+ pr_warn("RDS/IB: FMR invalidation failed (err=%d)\n", ret);
+
+ /* Now we can destroy the DMA mapping and unpin any pages */
+ list_for_each_entry_safe(ibmr, next, list, unmap_list) {
+ fmr = &ibmr->u.fmr;
+ *unpinned += ibmr->sg_len;
+ __rds_ib_teardown_mr(ibmr);
+ if (freed < goal ||
+ ibmr->remap_count >= ibmr->pool->fmr_attr.max_maps) {
+ if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL)
+ rds_ib_stats_inc(s_ib_rdma_mr_8k_free);
+ else
+ rds_ib_stats_inc(s_ib_rdma_mr_1m_free);
+ list_del(&ibmr->unmap_list);
+ ib_dealloc_fmr(fmr->fmr);
+ kfree(ibmr);
+ freed++;
+ }
+ }
+ *nfreed = freed;
+}
+
+void rds_ib_free_fmr_list(struct rds_ib_mr *ibmr)
+{
+ struct rds_ib_mr_pool *pool = ibmr->pool;
+
+ if (ibmr->remap_count >= pool->fmr_attr.max_maps)
+ llist_add(&ibmr->llnode, &pool->drop_list);
+ else
+ llist_add(&ibmr->llnode, &pool->free_list);
+}
diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c
new file mode 100644
index 000000000000..93ff038ea9d1
--- /dev/null
+++ b/net/rds/ib_frmr.c
@@ -0,0 +1,376 @@
+/*
+ * Copyright (c) 2016 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ib_mr.h"
+
+static struct rds_ib_mr *rds_ib_alloc_frmr(struct rds_ib_device *rds_ibdev,
+ int npages)
+{
+ struct rds_ib_mr_pool *pool;
+ struct rds_ib_mr *ibmr = NULL;
+ struct rds_ib_frmr *frmr;
+ int err = 0;
+
+ if (npages <= RDS_MR_8K_MSG_SIZE)
+ pool = rds_ibdev->mr_8k_pool;
+ else
+ pool = rds_ibdev->mr_1m_pool;
+
+ ibmr = rds_ib_try_reuse_ibmr(pool);
+ if (ibmr)
+ return ibmr;
+
+ ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL,
+ rdsibdev_to_node(rds_ibdev));
+ if (!ibmr) {
+ err = -ENOMEM;
+ goto out_no_cigar;
+ }
+
+ frmr = &ibmr->u.frmr;
+ frmr->mr = ib_alloc_mr(rds_ibdev->pd, IB_MR_TYPE_MEM_REG,
+ pool->fmr_attr.max_pages);
+ if (IS_ERR(frmr->mr)) {
+ pr_warn("RDS/IB: %s failed to allocate MR", __func__);
+ goto out_no_cigar;
+ }
+
+ ibmr->pool = pool;
+ if (pool->pool_type == RDS_IB_MR_8K_POOL)
+ rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc);
+ else
+ rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc);
+
+ if (atomic_read(&pool->item_count) > pool->max_items_soft)
+ pool->max_items_soft = pool->max_items;
+
+ frmr->fr_state = FRMR_IS_FREE;
+ return ibmr;
+
+out_no_cigar:
+ kfree(ibmr);
+ atomic_dec(&pool->item_count);
+ return ERR_PTR(err);
+}
+
+static void rds_ib_free_frmr(struct rds_ib_mr *ibmr, bool drop)
+{
+ struct rds_ib_mr_pool *pool = ibmr->pool;
+
+ if (drop)
+ llist_add(&ibmr->llnode, &pool->drop_list);
+ else
+ llist_add(&ibmr->llnode, &pool->free_list);
+ atomic_add(ibmr->sg_len, &pool->free_pinned);
+ atomic_inc(&pool->dirty_count);
+
+ /* If we've pinned too many pages, request a flush */
+ if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
+ atomic_read(&pool->dirty_count) >= pool->max_items / 5)
+ queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10);
+}
+
+static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr)
+{
+ struct rds_ib_frmr *frmr = &ibmr->u.frmr;
+ struct ib_send_wr *failed_wr;
+ struct ib_reg_wr reg_wr;
+ int ret;
+
+ while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) {
+ atomic_inc(&ibmr->ic->i_fastreg_wrs);
+ cpu_relax();
+ }
+
+ ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len, PAGE_SIZE);
+ if (unlikely(ret != ibmr->sg_len))
+ return ret < 0 ? ret : -EINVAL;
+
+ /* Perform a WR for the fast_reg_mr. Each individual page
+ * in the sg list is added to the fast reg page list and placed
+ * inside the fast_reg_mr WR. The key used is a rolling 8bit
+ * counter, which should guarantee uniqueness.
+ */
+ ib_update_fast_reg_key(frmr->mr, ibmr->remap_count++);
+ frmr->fr_state = FRMR_IS_INUSE;
+
+ memset(&reg_wr, 0, sizeof(reg_wr));
+ reg_wr.wr.wr_id = (unsigned long)(void *)ibmr;
+ reg_wr.wr.opcode = IB_WR_REG_MR;
+ reg_wr.wr.num_sge = 0;
+ reg_wr.mr = frmr->mr;
+ reg_wr.key = frmr->mr->rkey;
+ reg_wr.access = IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_WRITE;
+ reg_wr.wr.send_flags = IB_SEND_SIGNALED;
+
+ failed_wr = &reg_wr.wr;