summaryrefslogtreecommitdiffstats
path: root/drivers/infiniband/hw
diff options
context:
space:
mode:
authorDoug Ledford <dledford@redhat.com>2019-01-31 11:56:25 -0500
committerDoug Ledford <dledford@redhat.com>2019-01-31 11:56:25 -0500
commit2a6423961edf9db98d1e567992560e3bab65a9fc (patch)
tree142ea39376f3cba0e6cef13fba18441ec35b3820 /drivers/infiniband/hw
parentdb421a54996c602503204345171c662e65f20527 (diff)
parenta131d16460971353e7dd6916d9fd34c1c946a782 (diff)
Merge branch 'opfn' into hfi1-tid
This series adds the OPFN feature, which is used as the negotiation protocol by TID RDMA. This adds a totally hidden, in-band negotiation transfer that happens on the consumer's queue pair but without the consumer's knowledge. For that reason, things like completions for OPFN transfers must be filtered out of the completion queue and not sent to the consumer. This feature does not impact any consumer APIs, but does impact the driver/driver wire API. At a high level OPFN enables exchanging parameters between two hosts using IB compare and swap requests to a special virtual address. The request uses a reserved IB work request opcode (see patch 3). * opfn: IB/hfi1: Add static trace for OPFN IB/hfi1: Integrate OPFN into RC transactions IB/hfi1, IB/rdmavt: Allow for extending of QP's s_ack_queue IB/hfi1: OPFN interface IB/hfi1: Add OPFN helper functions for TID RDMA feature IB/hfi1: OPFN support discovery Signed-off-by: Doug Ledford <dledford@redhat.com>
Diffstat (limited to 'drivers/infiniband/hw')
-rw-r--r--drivers/infiniband/hw/hfi1/Makefile1
-rw-r--r--drivers/infiniband/hw/hfi1/chip.c11
-rw-r--r--drivers/infiniband/hw/hfi1/chip.h3
-rw-r--r--drivers/infiniband/hw/hfi1/hfi.h3
-rw-r--r--drivers/infiniband/hw/hfi1/init.c10
-rw-r--r--drivers/infiniband/hw/hfi1/opfn.c318
-rw-r--r--drivers/infiniband/hw/hfi1/opfn.h85
-rw-r--r--drivers/infiniband/hw/hfi1/qp.c13
-rw-r--r--drivers/infiniband/hw/hfi1/rc.c87
-rw-r--r--drivers/infiniband/hw/hfi1/ruc.c16
-rw-r--r--drivers/infiniband/hw/hfi1/tid_rdma.c216
-rw-r--r--drivers/infiniband/hw/hfi1/tid_rdma.h29
-rw-r--r--drivers/infiniband/hw/hfi1/trace.h1
-rw-r--r--drivers/infiniband/hw/hfi1/trace_rx.h107
-rw-r--r--drivers/infiniband/hw/hfi1/trace_tid.h332
-rw-r--r--drivers/infiniband/hw/hfi1/uc.c3
-rw-r--r--drivers/infiniband/hw/hfi1/verbs.c2
-rw-r--r--drivers/infiniband/hw/hfi1/verbs.h8
18 files changed, 1105 insertions, 140 deletions
diff --git a/drivers/infiniband/hw/hfi1/Makefile b/drivers/infiniband/hw/hfi1/Makefile
index 3ce9dc8c3463..4044a8c8dbf4 100644
--- a/drivers/infiniband/hw/hfi1/Makefile
+++ b/drivers/infiniband/hw/hfi1/Makefile
@@ -24,6 +24,7 @@ hfi1-y := \
mad.o \
mmu_rb.o \
msix.o \
+ opfn.o \
pcie.o \
pio.o \
pio_copy.o \
diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c
index b443642eac02..4d40311f082e 100644
--- a/drivers/infiniband/hw/hfi1/chip.c
+++ b/drivers/infiniband/hw/hfi1/chip.c
@@ -5222,6 +5222,17 @@ int is_bx(struct hfi1_devdata *dd)
return (chip_rev_minor & 0xF0) == 0x10;
}
+/* return true is kernel urg disabled for rcd */
+bool is_urg_masked(struct hfi1_ctxtdata *rcd)
+{
+ u64 mask;
+ u32 is = IS_RCVURGENT_START + rcd->ctxt;
+ u8 bit = is % 64;
+
+ mask = read_csr(rcd->dd, CCE_INT_MASK + (8 * (is / 64)));
+ return !(mask & BIT_ULL(bit));
+}
+
/*
* Append string s to buffer buf. Arguments curp and len are the current
* position and remaining length, respectively.
diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h
index 6b9c8f12dff8..ba3d99e6e33b 100644
--- a/drivers/infiniband/hw/hfi1/chip.h
+++ b/drivers/infiniband/hw/hfi1/chip.h
@@ -1,7 +1,7 @@
#ifndef _CHIP_H
#define _CHIP_H
/*
- * Copyright(c) 2015 - 2017 Intel Corporation.
+ * Copyright(c) 2015 - 2018 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
@@ -804,6 +804,7 @@ void clear_linkup_counters(struct hfi1_devdata *dd);
u32 hdrqempty(struct hfi1_ctxtdata *rcd);
int is_ax(struct hfi1_devdata *dd);
int is_bx(struct hfi1_devdata *dd);
+bool is_urg_masked(struct hfi1_ctxtdata *rcd);
u32 read_physical_state(struct hfi1_devdata *dd);
u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate);
const char *opa_lstate_name(u32 lstate);
diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h
index 6db2276f5c13..9aa0357e17b7 100644
--- a/drivers/infiniband/hw/hfi1/hfi.h
+++ b/drivers/infiniband/hw/hfi1/hfi.h
@@ -73,6 +73,7 @@
#include "chip_registers.h"
#include "common.h"
+#include "opfn.h"
#include "verbs.h"
#include "pio.h"
#include "chip.h"
@@ -98,6 +99,8 @@
#define NEIGHBOR_TYPE_HFI 0
#define NEIGHBOR_TYPE_SWITCH 1
+#define HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES 5
+
extern unsigned long hfi1_cap_mask;
#define HFI1_CAP_KGET_MASK(mask, cap) ((mask) & HFI1_CAP_##cap)
#define HFI1_CAP_UGET_MASK(mask, cap) \
diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c
index 7835eb52e7c5..a8dbd0f191f5 100644
--- a/drivers/infiniband/hw/hfi1/init.c
+++ b/drivers/infiniband/hw/hfi1/init.c
@@ -72,7 +72,6 @@
#undef pr_fmt
#define pr_fmt(fmt) DRIVER_NAME ": " fmt
-#define HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES 5
/*
* min buffers we want to have per context, after driver
*/
@@ -927,6 +926,8 @@ int hfi1_init(struct hfi1_devdata *dd, int reinit)
lastfail = hfi1_create_rcvhdrq(dd, rcd);
if (!lastfail)
lastfail = hfi1_setup_eagerbufs(rcd);
+ if (!lastfail)
+ lastfail = hfi1_kern_exp_rcv_init(rcd, reinit);
if (lastfail) {
dd_dev_err(dd,
"failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n");
@@ -1497,6 +1498,12 @@ static int __init hfi1_mod_init(void)
/* sanitize link CRC options */
link_crc_mask &= SUPPORTED_CRCS;
+ ret = opfn_init();
+ if (ret < 0) {
+ pr_err("Failed to allocate opfn_wq");
+ goto bail_dev;
+ }
+
/*
* These must be called before the driver is registered with
* the PCI subsystem.
@@ -1527,6 +1534,7 @@ module_init(hfi1_mod_init);
static void __exit hfi1_mod_cleanup(void)
{
pci_unregister_driver(&hfi1_pci_driver);
+ opfn_exit();
node_affinity_destroy_all();
hfi1_dbg_exit();
diff --git a/drivers/infiniband/hw/hfi1/opfn.c b/drivers/infiniband/hw/hfi1/opfn.c
new file mode 100644
index 000000000000..2ca070690b2f
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/opfn.c
@@ -0,0 +1,318 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ */
+#include "hfi.h"
+#include "trace.h"
+#include "qp.h"
+#include "opfn.h"
+
+#define IB_BTHE_E BIT(IB_BTHE_E_SHIFT)
+
+#define OPFN_CODE(code) BIT((code) - 1)
+#define OPFN_MASK(code) OPFN_CODE(STL_VERBS_EXTD_##code)
+
+struct hfi1_opfn_type {
+ bool (*request)(struct rvt_qp *qp, u64 *data);
+ bool (*response)(struct rvt_qp *qp, u64 *data);
+ bool (*reply)(struct rvt_qp *qp, u64 data);
+ void (*error)(struct rvt_qp *qp);
+};
+
+static struct hfi1_opfn_type hfi1_opfn_handlers[STL_VERBS_EXTD_MAX] = {
+ [STL_VERBS_EXTD_TID_RDMA] = {
+ .request = tid_rdma_conn_req,
+ .response = tid_rdma_conn_resp,
+ .reply = tid_rdma_conn_reply,
+ .error = tid_rdma_conn_error,
+ },
+};
+
+static struct workqueue_struct *opfn_wq;
+
+static void opfn_schedule_conn_request(struct rvt_qp *qp);
+
+static bool hfi1_opfn_extended(u32 bth1)
+{
+ return !!(bth1 & IB_BTHE_E);
+}
+
+static void opfn_conn_request(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct ib_atomic_wr wr;
+ u16 mask, capcode;
+ struct hfi1_opfn_type *extd;
+ u64 data;
+ unsigned long flags;
+ int ret = 0;
+
+ trace_hfi1_opfn_state_conn_request(qp);
+ spin_lock_irqsave(&priv->opfn.lock, flags);
+ /*
+ * Exit if the extended bit is not set, or if nothing is requested, or
+ * if we have completed all requests, or if a previous request is in
+ * progress
+ */
+ if (!priv->opfn.extended || !priv->opfn.requested ||
+ priv->opfn.requested == priv->opfn.completed || priv->opfn.curr)
+ goto done;
+
+ mask = priv->opfn.requested & ~priv->opfn.completed;
+ capcode = ilog2(mask & ~(mask - 1)) + 1;
+ if (capcode >= STL_VERBS_EXTD_MAX) {
+ priv->opfn.completed |= OPFN_CODE(capcode);
+ goto done;
+ }
+
+ extd = &hfi1_opfn_handlers[capcode];
+ if (!extd || !extd->request || !extd->request(qp, &data)) {
+ /*
+ * Either there is no handler for this capability or the request
+ * packet could not be generated. Either way, mark it as done so
+ * we don't keep attempting to complete it.
+ */
+ priv->opfn.completed |= OPFN_CODE(capcode);
+ goto done;
+ }
+
+ trace_hfi1_opfn_data_conn_request(qp, capcode, data);
+ data = (data & ~0xf) | capcode;
+
+ memset(&wr, 0, sizeof(wr));
+ wr.wr.opcode = IB_WR_OPFN;
+ wr.remote_addr = HFI1_VERBS_E_ATOMIC_VADDR;
+ wr.compare_add = data;
+
+ priv->opfn.curr = capcode; /* A new request is now in progress */
+ /* Drop opfn.lock before calling ib_post_send() */
+ spin_unlock_irqrestore(&priv->opfn.lock, flags);
+
+ ret = ib_post_send(&qp->ibqp, &wr.wr, NULL);
+ if (ret)
+ goto err;
+ trace_hfi1_opfn_state_conn_request(qp);
+ return;
+err:
+ trace_hfi1_msg_opfn_conn_request(qp, "ib_ost_send failed: ret = ",
+ (u64)ret);
+ spin_lock_irqsave(&priv->opfn.lock, flags);
+ /*
+ * In case of an unexpected error return from ib_post_send
+ * clear opfn.curr and reschedule to try again
+ */
+ priv->opfn.curr = STL_VERBS_EXTD_NONE;
+ opfn_schedule_conn_request(qp);
+done:
+ spin_unlock_irqrestore(&priv->opfn.lock, flags);
+}
+
+void opfn_send_conn_request(struct work_struct *work)
+{
+ struct hfi1_opfn_data *od;
+ struct hfi1_qp_priv *qpriv;
+
+ od = container_of(work, struct hfi1_opfn_data, opfn_work);
+ qpriv = container_of(od, struct hfi1_qp_priv, opfn);
+
+ opfn_conn_request(qpriv->owner);
+}
+
+/*
+ * When QP s_lock is held in the caller, the OPFN request must be scheduled
+ * to a different workqueue to avoid double locking QP s_lock in call to
+ * ib_post_send in opfn_conn_request
+ */
+static void opfn_schedule_conn_request(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ trace_hfi1_opfn_state_sched_conn_request(qp);
+ queue_work(opfn_wq, &priv->opfn.opfn_work);
+}
+
+void opfn_conn_response(struct rvt_qp *qp, struct rvt_ack_entry *e,
+ struct ib_atomic_eth *ateth)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ u64 data = be64_to_cpu(ateth->compare_data);
+ struct hfi1_opfn_type *extd;
+ u8 capcode;
+ unsigned long flags;
+
+ trace_hfi1_opfn_state_conn_response(qp);
+ capcode = data & 0xf;
+ trace_hfi1_opfn_data_conn_response(qp, capcode, data);
+ if (!capcode || capcode >= STL_VERBS_EXTD_MAX)
+ return;
+
+ extd = &hfi1_opfn_handlers[capcode];
+
+ if (!extd || !extd->response) {
+ e->atomic_data = capcode;
+ return;
+ }
+
+ spin_lock_irqsave(&priv->opfn.lock, flags);
+ if (priv->opfn.completed & OPFN_CODE(capcode)) {
+ /*
+ * We are receiving a request for a feature that has already
+ * been negotiated. This may mean that the other side has reset
+ */
+ priv->opfn.completed &= ~OPFN_CODE(capcode);
+ if (extd->error)
+ extd->error(qp);
+ }
+
+ if (extd->response(qp, &data))
+ priv->opfn.completed |= OPFN_CODE(capcode);
+ e->atomic_data = (data & ~0xf) | capcode;
+ trace_hfi1_opfn_state_conn_response(qp);
+ spin_unlock_irqrestore(&priv->opfn.lock, flags);
+}
+
+void opfn_conn_reply(struct rvt_qp *qp, u64 data)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct hfi1_opfn_type *extd;
+ u8 capcode;
+ unsigned long flags;
+
+ trace_hfi1_opfn_state_conn_reply(qp);
+ capcode = data & 0xf;
+ trace_hfi1_opfn_data_conn_reply(qp, capcode, data);
+ if (!capcode || capcode >= STL_VERBS_EXTD_MAX)
+ return;
+
+ spin_lock_irqsave(&priv->opfn.lock, flags);
+ /*
+ * Either there is no previous request or the reply is not for the
+ * current request
+ */
+ if (!priv->opfn.curr || capcode != priv->opfn.curr)
+ goto done;
+
+ extd = &hfi1_opfn_handlers[capcode];
+
+ if (!extd || !extd->reply)
+ goto clear;
+
+ if (extd->reply(qp, data))
+ priv->opfn.completed |= OPFN_CODE(capcode);
+clear:
+ /*
+ * Clear opfn.curr to indicate that the previous request is no longer in
+ * progress
+ */
+ priv->opfn.curr = STL_VERBS_EXTD_NONE;
+ trace_hfi1_opfn_state_conn_reply(qp);
+done:
+ spin_unlock_irqrestore(&priv->opfn.lock, flags);
+}
+
+void opfn_conn_error(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct hfi1_opfn_type *extd = NULL;
+ unsigned long flags;
+ u16 capcode;
+
+ trace_hfi1_opfn_state_conn_error(qp);
+ trace_hfi1_msg_opfn_conn_error(qp, "error. qp state ", (u64)qp->state);
+ /*
+ * The QP has gone into the Error state. We have to invalidate all
+ * negotiated feature, including the one in progress (if any). The RC
+ * QP handling will clean the WQE for the connection request.
+ */
+ spin_lock_irqsave(&priv->opfn.lock, flags);
+ while (priv->opfn.completed) {
+ capcode = priv->opfn.completed & ~(priv->opfn.completed - 1);
+ extd = &hfi1_opfn_handlers[ilog2(capcode) + 1];
+ if (extd->error)
+ extd->error(qp);
+ priv->opfn.completed &= ~OPFN_CODE(capcode);
+ }
+ priv->opfn.extended = 0;
+ priv->opfn.requested = 0;
+ priv->opfn.curr = STL_VERBS_EXTD_NONE;
+ spin_unlock_irqrestore(&priv->opfn.lock, flags);
+}
+
+void opfn_qp_init(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask)
+{
+ struct ib_qp *ibqp = &qp->ibqp;
+ struct hfi1_qp_priv *priv = qp->priv;
+ unsigned long flags;
+
+ spin_lock_irqsave(&priv->opfn.lock, flags);
+ if (ibqp->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
+ struct tid_rdma_params *local = &priv->tid_rdma.local;
+
+ if (qp->pmtu == enum_to_mtu(OPA_MTU_4096) ||
+ qp->pmtu == enum_to_mtu(OPA_MTU_8192)) {
+ tid_rdma_opfn_init(qp, local);
+ /*
+ * We only want to set the OPFN requested bit when the
+ * QP transitions to RTS.
+ */
+ if (attr_mask & IB_QP_STATE &&
+ attr->qp_state == IB_QPS_RTS) {
+ priv->opfn.requested |= OPFN_MASK(TID_RDMA);
+ /*
+ * If the QP is transitioning to RTS and the
+ * opfn.completed for TID RDMA has already been
+ * set, the QP is being moved *back* into RTS.
+ * We can now renegotiate the TID RDMA
+ * parameters.
+ */
+ if (priv->opfn.completed &
+ OPFN_MASK(TID_RDMA)) {
+ priv->opfn.completed &=
+ ~OPFN_MASK(TID_RDMA);
+ /*
+ * Since the opfn.completed bit was
+ * already set, it is safe to assume
+ * that the opfn.extended is also set.
+ */
+ opfn_schedule_conn_request(qp);
+ }
+ }
+ } else {
+ memset(local, 0, sizeof(*local));
+ }
+ }
+ spin_unlock_irqrestore(&priv->opfn.lock, flags);
+}
+
+void opfn_trigger_conn_request(struct rvt_qp *qp, u32 bth1)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ if (!priv->opfn.extended && hfi1_opfn_extended(bth1) &&
+ HFI1_CAP_IS_KSET(OPFN)) {
+ priv->opfn.extended = 1;
+ if (qp->state == IB_QPS_RTS)
+ opfn_conn_request(qp);
+ }
+}
+
+int opfn_init(void)
+{
+ opfn_wq = alloc_workqueue("hfi_opfn",
+ WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE |
+ WQ_MEM_RECLAIM,
+ HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES);
+ if (!opfn_wq)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void opfn_exit(void)
+{
+ if (opfn_wq) {
+ destroy_workqueue(opfn_wq);
+ opfn_wq = NULL;
+ }
+}
diff --git a/drivers/infiniband/hw/hfi1/opfn.h b/drivers/infiniband/hw/hfi1/opfn.h
new file mode 100644
index 000000000000..5f2011cabc25
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/opfn.h
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ */
+#ifndef _HFI1_OPFN_H
+#define _HFI1_OPFN_H
+
+/**
+ * DOC: Omni Path Feature Negotion (OPFN)
+ *
+ * OPFN is a discovery protocol for Intel Omni-Path fabric that
+ * allows two RC QPs to negotiate a common feature that both QPs
+ * can support. Currently, the only OPA feature that OPFN
+ * supports is TID RDMA.
+ *
+ * Architecture
+ *
+ * OPFN involves the communication between two QPs on the HFI
+ * level on an Omni-Path fabric, and ULPs have no knowledge of
+ * OPFN at all.
+ *
+ * Implementation
+ *
+ * OPFN extends the existing IB RC protocol with the following
+ * changes:
+ * -- Uses Bit 24 (reserved) of DWORD 1 of Base Transport
+ * Header (BTH1) to indicate that the RC QP supports OPFN;
+ * -- Uses a combination of RC COMPARE_SWAP opcode (0x13) and
+ * the address U64_MAX (0xFFFFFFFFFFFFFFFF) as an OPFN
+ * request; The 64-bit data carried with the request/response
+ * contains the parameters for negotiation and will be
+ * defined in tid_rdma.c file;
+ * -- Defines IB_WR_RESERVED3 as IB_WR_OPFN.
+ *
+ * The OPFN communication will be triggered when an RC QP
+ * receives a request with Bit 24 of BTH1 set. The responder QP
+ * will then post send an OPFN request with its local
+ * parameters, which will be sent to the requester QP once all
+ * existing requests on the responder QP side have been sent.
+ * Once the requester QP receives the OPFN request, it will
+ * keep a copy of the responder QP's parameters, and return a
+ * response packet with its own local parameters. The responder
+ * QP receives the response packet and keeps a copy of the requester
+ * QP's parameters. After this exchange, each side has the parameters
+ * for both sides and therefore can select the right parameters
+ * for future transactions
+ */
+
+/* STL Verbs Extended */
+#define IB_BTHE_E_SHIFT 24
+#define HFI1_VERBS_E_ATOMIC_VADDR U64_MAX
+
+struct ib_atomic_eth;
+
+enum hfi1_opfn_codes {
+ STL_VERBS_EXTD_NONE = 0,
+ STL_VERBS_EXTD_TID_RDMA,
+ STL_VERBS_EXTD_MAX
+};
+
+struct hfi1_opfn_data {
+ u8 extended;
+ u16 requested;
+ u16 completed;
+ enum hfi1_opfn_codes curr;
+ /* serialize opfn function calls */
+ spinlock_t lock;
+ struct work_struct opfn_work;
+};
+
+/* WR opcode for OPFN */
+#define IB_WR_OPFN IB_WR_RESERVED3
+
+void opfn_send_conn_request(struct work_struct *work);
+void opfn_conn_response(struct rvt_qp *qp, struct rvt_ack_entry *e,
+ struct ib_atomic_eth *ateth);
+void opfn_conn_reply(struct rvt_qp *qp, u64 data);
+void opfn_conn_error(struct rvt_qp *qp);
+void opfn_qp_init(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask);
+void opfn_trigger_conn_request(struct rvt_qp *qp, u32 bth1);
+int opfn_init(void);
+void opfn_exit(void);
+
+#endif /* _HFI1_OPFN_H */
diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c
index 5344e8993b28..f822f92b415f 100644
--- a/drivers/infiniband/hw/hfi1/qp.c
+++ b/drivers/infiniband/hw/hfi1/qp.c
@@ -132,6 +132,12 @@ const struct rvt_operation_params hfi1_post_parms[RVT_OPERATION_MAX] = {
.qpt_support = BIT(IB_QPT_RC),
},
+[IB_WR_OPFN] = {
+ .length = sizeof(struct ib_atomic_wr),
+ .qpt_support = BIT(IB_QPT_RC),
+ .flags = RVT_OPERATION_USE_RESERVE,
+},
+
};
static void flush_list_head(struct list_head *l)
@@ -285,6 +291,8 @@ void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr,
priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc);
qp_set_16b(qp);
}
+
+ opfn_qp_init(qp, attr, attr_mask);
}
/**
@@ -696,6 +704,7 @@ void qp_priv_free(struct rvt_dev_info *rdi, struct rvt_qp *qp)
{
struct hfi1_qp_priv *priv = qp->priv;
+ hfi1_qp_priv_tid_free(rdi, qp);
kfree(priv->s_ahg);
kfree(priv);
}
@@ -751,6 +760,10 @@ void notify_qp_reset(struct rvt_qp *qp)
{
qp->r_adefered = 0;
clear_ahg(qp);
+
+ /* Clear any OPFN state */
+ if (qp->ibqp.qp_type == IB_QPT_RC)
+ opfn_conn_error(qp);
}
/*
diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c
index be603f35d7e4..092d5eba980f 100644
--- a/drivers/infiniband/hw/hfi1/rc.c
+++ b/drivers/infiniband/hw/hfi1/rc.c
@@ -57,6 +57,10 @@
/* cut down ridiculously long IB macro names */
#define OP(x) RC_OP(x)
+static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
+ struct rvt_swqe *wqe,
+ struct hfi1_ibport *ibp);
+
static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
u32 psn, u32 pmtu)
{
@@ -89,8 +93,8 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
struct rvt_ack_entry *e;
u32 hwords;
u32 len;
- u32 bth0;
- u32 bth2;
+ u32 bth0, bth2;
+ u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT);
int middle = 0;
u32 pmtu = qp->pmtu;
struct hfi1_qp_priv *priv = qp->priv;
@@ -122,7 +126,8 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
* response has been sent instead of only being
* constructed.
*/
- if (++qp->s_tail_ack_queue > HFI1_MAX_RDMA_ATOMIC)
+ if (++qp->s_tail_ack_queue >
+ rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
qp->s_tail_ack_queue = 0;
/* FALLTHROUGH */
case OP(SEND_ONLY):
@@ -229,7 +234,7 @@ normal:
ps->s_txreq->sde = priv->s_sde;
ps->s_txreq->s_cur_size = len;
ps->s_txreq->hdr_dwords = hwords;
- hfi1_make_ruc_header(qp, ohdr, bth0, bth2, middle, ps);
+ hfi1_make_ruc_header(qp, ohdr, bth0, bth1, bth2, middle, ps);
return 1;
bail:
@@ -262,8 +267,8 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
struct rvt_swqe *wqe;
u32 hwords;
u32 len;
- u32 bth0 = 0;
- u32 bth2;
+ u32 bth0 = 0, bth2;
+ u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT);
u32 pmtu = qp->pmtu;
char newreq;
int middle = 0;
@@ -516,10 +521,14 @@ no_flow_control:
goto bail;
}
qp->s_num_rd_atomic++;
- if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
- qp->s_lsn++;
}
- if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
+
+ /* FALLTHROUGH */
+ case IB_WR_OPFN:
+ if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
+ qp->s_lsn++;
+ if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+ wqe->wr.opcode == IB_WR_OPFN) {
qp->s_state = OP(COMPARE_SWAP);
put_ib_ateth_swap(wqe->atomic_wr.swap,
&ohdr->u.atomic_eth);
@@ -693,6 +702,7 @@ no_flow_control:
qp,
ohdr,
bth0 | (qp->s_state << 24),
+ bth1,
bth2,
middle,
ps);
@@ -796,6 +806,11 @@ static inline void hfi1_make_rc_ack_9B(struct hfi1_packet *packet,
if (qp->s_mig_state == IB_MIG_MIGRATED)
bth0 |= IB_BTH_MIG_REQ;
bth1 = (!!is_fecn) << IB_BECN_SHIFT;
+ /*
+ * Inline ACKs go out without the use of the Verbs send engine, so
+ * we need to set the STL Verbs Extended bit here
+ */
+ bth1 |= HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT;
hfi1_make_bth_aeth(qp, ohdr, bth0, bth1);
}
@@ -1033,6 +1048,7 @@ done:
*/
void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
{
+ struct hfi1_qp_priv *priv = qp->priv;
struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
struct hfi1_ibport *ibp;
@@ -1043,8 +1059,26 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
hfi1_migrate_qp(qp);
qp->s_retry = qp->s_retry_cnt;
} else if (qp->s_last == qp->s_acked) {
- rvt_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
- rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+ /*
+ * We need special handling for the OPFN request WQEs as
+ * they are not allowed to generate real user errors
+ */
+ if (wqe->wr.opcode == IB_WR_OPFN) {
+ struct hfi1_ibport *ibp =
+ to_iport(qp->ibqp.device, qp->port_num);
+ /*
+ * Call opfn_conn_reply() with capcode and
+ * remaining data as 0 to close out the
+ * current request
+ */
+ opfn_conn_reply(qp, priv->opfn.curr);
+ wqe = do_rc_completion(qp, wqe, ibp);
+ qp->s_flags &= ~RVT_S_WAIT_ACK;
+ } else {
+ rvt_send_complete(qp, wqe,
+ IB_WC_RETRY_EXC_ERR);
+ rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+ }
return;
} else { /* need to handle delayed completion */
return;
@@ -1356,6 +1390,9 @@ static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
u64 *vaddr = wqe->sg_list[0].vaddr;
*vaddr = val;
}
+ if (wqe->wr.opcode == IB_WR_OPFN)
+ opfn_conn_reply(qp, val);
+
if (qp->s_num_rd_atomic &&
(wqe->wr.opcode == IB_WR_RDMA_READ ||
wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
@@ -1812,7 +1849,7 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
if (i)
prev = i - 1;
else
- prev = HFI1_MAX_RDMA_ATOMIC;
+ prev = rvt_size_atomic(ib_to_rvt(qp->ibqp.device));
if (prev == qp->r_head_ack_queue) {
e = NULL;
break;
@@ -1936,7 +1973,7 @@ static inline void update_ack_queue(struct rvt_qp *qp, unsigned n)
unsigned next;
next = n + 1;
- if (next > HFI1_MAX_RDMA_ATOMIC)
+ if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
next = 0;
qp->s_tail_ack_queue = next;
qp->s_ack_state = OP(ACKNOWLEDGE);
@@ -2061,6 +2098,7 @@ void hfi1_rc_rcv(struct hfi1_packet *packet)
return;
fecn = process_ecn(qp, packet);
+ opfn_trigger_conn_request(qp, be32_to_cpu(ohdr->bth[1]));
/*
* Process responses (ACKs) before anything else. Note that the
@@ -2292,8 +2330,8 @@ send_last:
if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
goto nack_inv;
next = qp->r_head_ack_queue + 1;
- /* s_ack_queue is size HFI1_MAX_RDMA_ATOMIC+1 so use > not >= */
- if (next > HFI1_MAX_RDMA_ATOMIC)
+ /* s_ack_queue is size rvt_size_atomic()+1 so use > not >= */
+ if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
next = 0;
spin_lock_irqsave(&qp->s_lock, flags);
if (unlikely(next == qp->s_tail_ack_queue)) {
@@ -2356,18 +2394,21 @@ send_last:
case OP(COMPARE_SWAP):
case OP(FETCH_ADD): {
- struct ib_atomic_eth *ateth;
+ struct ib_atomic_eth *ateth = &ohdr->u.atomic_eth;
+ u64 vaddr = get_ib_ateth_vaddr(ateth);
+ bool opfn = opcode == OP(COMPARE_SWAP) &&
+ vaddr == HFI1_VERBS_E_ATOMIC_VADDR;
struct rvt_ack_entry *e;
- u64 vaddr;
atomic64_t *maddr;
u64 sdata;
u32 rkey;
u8 next;
- if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
+ !opfn))
goto nack_inv;
next = qp->r_head_ack_queue + 1;
- if (next > HFI1_MAX_RDMA_ATOMIC)
+ if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
next = 0;
spin_lock_irqsave(&qp->s_lock, flags);
if (unlikely(next == qp->s_tail_ack_queue)) {
@@ -2380,8 +2421,11 @@ send_last:
rvt_put_mr(e->rdma_sge.mr);
e->rdma_sge.mr = NULL;
}
- ateth = &ohdr->u.atomic_eth;
- vaddr = get_ib_ateth_vaddr(ateth);
+ /* Process OPFN special virtual address */
+ if (opfn) {
+ opfn_conn_response(qp, e, ateth);
+ goto ack;
+ }
if (unlikely(vaddr & (sizeof(u64) - 1)))
goto nack_inv_unlck;
rkey = be32_to_cpu(ateth->rkey);
@@ -2400,6 +2444,7 @@ send_last:
sdata);
rvt_put_mr(qp->r_sge.sge.mr);
qp->r_sge.num_sge = 0;
+ack:
e->opcode = opcode;
e->sent = 0;
e->psn = psn;
diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c
index 7fb317c711df..f96c0f544cb0 100644
--- a/drivers/infiniband/hw/hfi1/ruc.c
+++ b/drivers/infiniband/hw/hfi1/ruc.c
@@ -250,7 +250,6 @@ static inline void hfi1_make_ruc_bth(struct rvt_qp *qp,
struct ib_other_headers *ohdr,
u32 bth0, u32 bth1, u32 bth2)
{
- bth1 |= qp->remote_qpn;
ohdr->bth[0] = cpu_to_be32(bth0);
ohdr->bth[1] = cpu_to_be32(bth1);
ohdr->bth[2] = cpu_to_be32(bth2);
@@ -272,13 +271,13 @@ static inline void hfi1_make_ruc_bth(struct rvt_qp *qp,
*/
static inline void hfi1_make_ruc_header_16B(struct rvt_qp *qp,
struct ib_other_headers *ohdr,
- u32 bth0, u32 bth2, int middle,
+ u32 bth0, u32 bth1, u32 bth2,
+ int middle,
struct hfi1_pkt_state *ps)
{
struct hfi1_qp_priv *priv = qp->priv;
struct hfi1_ibport *ibp = ps->ibp;
struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
- u32 bth1 = 0;
u32 slid;
u16 pkey = hfi1_get_pkey(ibp, qp->s_pkey_index);
u8 l4 = OPA_16B_L4_IB_LOCAL;
@@ -360,12 +359,12 @@ static inline void hfi1_make_ruc_header_16B(struct rvt_qp *qp,
*/
static inline void hfi1_make_ruc_header_9B(struct rvt_qp *qp,
struct ib_other_headers *ohdr,
- u32 bth0, u32 bth2, int middle,
+ u32 bth0, u32 bth1, u32 bth2,
+ int middle,
struct hfi1_pkt_state *ps)
{
struct hfi1_qp_priv *priv = qp->priv;
struct hfi1_ibport *ibp = ps->ibp;
- u32 bth1 = 0;
u16 pkey = hfi1_get_pkey(ibp, qp->s_pkey_index);
u16 lrh0 = HFI1_LRH_BTH;
u8 extra_bytes = -ps->s_txreq->s_cur_size & 3;
@@ -415,7 +414,7 @@ static inline void hfi1_make_ruc_header_9B(struct rvt_qp *qp,
typedef void (*hfi1_make_ruc_hdr)(struct rvt_qp *qp,
struct ib_other_headers *ohdr,
- u32 bth0, u32 bth2, int middle,
+ u32 bth0, u32 bth1, u32 bth2, int middle,
struct hfi1_pkt_state *ps);
/* We support only two types - 9B and 16B for now */
@@ -425,7 +424,7 @@ static const hfi1_make_ruc_hdr hfi1_ruc_header_tbl[2] = {
};
void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
- u32 bth0, u32 bth2, int middle,
+ u32 bth0, u32 bth1, u32 bth2, int middle,
struct hfi1_pkt_state *ps)
{
struct hfi1_qp_priv *priv = qp->priv;
@@ -446,7 +445,8 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
priv->s_ahg->ahgidx = 0;
/* Make the appropriate header */
- hfi1_ruc_header_tbl[priv->hdr_type](qp, ohdr, bth0, bth2, middle, ps);
+ hfi1_ruc_header_tbl[priv->hdr_type](qp, ohdr, bth0, bth1, bth2, middle,
+ ps);
}
/* when sending, force a reschedule every one of these periods */
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c
index da1ecb68a928..e8f57c0cd8bc 100644
--- a/drivers/infiniband/hw/hfi1/tid_rdma.c
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.c
@@ -7,6 +7,211 @@
#include "hfi.h"
#include "verbs.h"
#include "tid_rdma.h"
+#include "trace.h"
+
+/*
+ * J_KEY for kernel contexts when TID RDMA is used.
+ * See generate_jkey() in hfi.h for more information.
+ */
+#define TID_RDMA_JKEY 32
+#define HFI1_KERNEL_MIN_JKEY HFI1_ADMIN_JKEY_RANGE
+#define HFI1_KERNEL_MAX_JKEY (2 * HFI1_ADMIN_JKEY_RANGE - 1)
+
+#define TID_RDMA_MAX_READ_SEGS_PER_REQ 6
+#define TID_RDMA_MAX_WRITE_SEGS_PER_REQ 4
+
+#define TID_OPFN_QP_CTXT_MASK 0xff
+#define TID_OPFN_QP_CTXT_SHIFT 56
+#define TID_OPFN_QP_KDETH_MASK 0xff
+#define TID_OPFN_QP_KDETH_SHIFT 48
+#define TID_OPFN_MAX_LEN_MASK 0x7ff
+#define TID_OPFN_MAX_LEN_SHIFT 37
+#define TID_OPFN_TIMEOUT_MASK 0x1f
+#define TID_OPFN_TIMEOUT_SHIFT 32
+#define TID_OPFN_RESERVED_MASK 0x3f
+#define TID_OPFN_RESERVED_SHIFT 26
+#define TID_OPFN_URG_MASK 0x1
+#define TID_OPFN_URG_SHIFT 25
+