diff options
Diffstat (limited to 'drivers/staging/rdma/hfi1/verbs.c')
-rw-r--r-- | drivers/staging/rdma/hfi1/verbs.c | 2142 |
1 files changed, 2142 insertions, 0 deletions
diff --git a/drivers/staging/rdma/hfi1/verbs.c b/drivers/staging/rdma/hfi1/verbs.c new file mode 100644 index 000000000000..5f4b6617677a --- /dev/null +++ b/drivers/staging/rdma/hfi1/verbs.c @@ -0,0 +1,2142 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <rdma/ib_mad.h> +#include <rdma/ib_user_verbs.h> +#include <linux/io.h> +#include <linux/module.h> +#include <linux/utsname.h> +#include <linux/rculist.h> +#include <linux/mm.h> +#include <linux/random.h> +#include <linux/vmalloc.h> + +#include "hfi.h" +#include "common.h" +#include "device.h" +#include "trace.h" +#include "qp.h" +#include "sdma.h" + +unsigned int hfi1_lkey_table_size = 16; +module_param_named(lkey_table_size, hfi1_lkey_table_size, uint, + S_IRUGO); +MODULE_PARM_DESC(lkey_table_size, + "LKEY table size in bits (2^n, 1 <= n <= 23)"); + +static unsigned int hfi1_max_pds = 0xFFFF; +module_param_named(max_pds, hfi1_max_pds, uint, S_IRUGO); +MODULE_PARM_DESC(max_pds, + "Maximum number of protection domains to support"); + +static unsigned int hfi1_max_ahs = 0xFFFF; +module_param_named(max_ahs, hfi1_max_ahs, uint, S_IRUGO); +MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support"); + +unsigned int hfi1_max_cqes = 0x2FFFF; +module_param_named(max_cqes, hfi1_max_cqes, uint, S_IRUGO); +MODULE_PARM_DESC(max_cqes, + "Maximum number of completion queue entries to support"); + +unsigned int hfi1_max_cqs = 0x1FFFF; +module_param_named(max_cqs, hfi1_max_cqs, uint, S_IRUGO); +MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support"); + +unsigned int hfi1_max_qp_wrs = 0x3FFF; +module_param_named(max_qp_wrs, hfi1_max_qp_wrs, uint, S_IRUGO); +MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support"); + +unsigned int hfi1_max_qps = 16384; +module_param_named(max_qps, hfi1_max_qps, uint, S_IRUGO); +MODULE_PARM_DESC(max_qps, "Maximum number of QPs to support"); + +unsigned int hfi1_max_sges = 0x60; +module_param_named(max_sges, hfi1_max_sges, uint, S_IRUGO); +MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support"); + +unsigned int hfi1_max_mcast_grps = 16384; +module_param_named(max_mcast_grps, hfi1_max_mcast_grps, uint, S_IRUGO); +MODULE_PARM_DESC(max_mcast_grps, + "Maximum number of multicast groups to support"); + +unsigned int hfi1_max_mcast_qp_attached = 16; +module_param_named(max_mcast_qp_attached, hfi1_max_mcast_qp_attached, + uint, S_IRUGO); +MODULE_PARM_DESC(max_mcast_qp_attached, + "Maximum number of attached QPs to support"); + +unsigned int hfi1_max_srqs = 1024; +module_param_named(max_srqs, hfi1_max_srqs, uint, S_IRUGO); +MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support"); + +unsigned int hfi1_max_srq_sges = 128; +module_param_named(max_srq_sges, hfi1_max_srq_sges, uint, S_IRUGO); +MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support"); + +unsigned int hfi1_max_srq_wrs = 0x1FFFF; +module_param_named(max_srq_wrs, hfi1_max_srq_wrs, uint, S_IRUGO); +MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs support"); + +static void verbs_sdma_complete( + struct sdma_txreq *cookie, + int status, + int drained); + +/* + * Note that it is OK to post send work requests in the SQE and ERR + * states; hfi1_do_send() will process them and generate error + * completions as per IB 1.2 C10-96. + */ +const int ib_hfi1_state_ops[IB_QPS_ERR + 1] = { + [IB_QPS_RESET] = 0, + [IB_QPS_INIT] = HFI1_POST_RECV_OK, + [IB_QPS_RTR] = HFI1_POST_RECV_OK | HFI1_PROCESS_RECV_OK, + [IB_QPS_RTS] = HFI1_POST_RECV_OK | HFI1_PROCESS_RECV_OK | + HFI1_POST_SEND_OK | HFI1_PROCESS_SEND_OK | + HFI1_PROCESS_NEXT_SEND_OK, + [IB_QPS_SQD] = HFI1_POST_RECV_OK | HFI1_PROCESS_RECV_OK | + HFI1_POST_SEND_OK | HFI1_PROCESS_SEND_OK, + [IB_QPS_SQE] = HFI1_POST_RECV_OK | HFI1_PROCESS_RECV_OK | + HFI1_POST_SEND_OK | HFI1_FLUSH_SEND, + [IB_QPS_ERR] = HFI1_POST_RECV_OK | HFI1_FLUSH_RECV | + HFI1_POST_SEND_OK | HFI1_FLUSH_SEND, +}; + +struct hfi1_ucontext { + struct ib_ucontext ibucontext; +}; + +static inline struct hfi1_ucontext *to_iucontext(struct ib_ucontext + *ibucontext) +{ + return container_of(ibucontext, struct hfi1_ucontext, ibucontext); +} + +/* + * Translate ib_wr_opcode into ib_wc_opcode. + */ +const enum ib_wc_opcode ib_hfi1_wc_opcode[] = { + [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE, + [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE, + [IB_WR_SEND] = IB_WC_SEND, + [IB_WR_SEND_WITH_IMM] = IB_WC_SEND, + [IB_WR_RDMA_READ] = IB_WC_RDMA_READ, + [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP, + [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD +}; + +/* + * Length of header by opcode, 0 --> not supported + */ +const u8 hdr_len_by_opcode[256] = { + /* RC */ + [IB_OPCODE_RC_SEND_FIRST] = 12 + 8, + [IB_OPCODE_RC_SEND_MIDDLE] = 12 + 8, + [IB_OPCODE_RC_SEND_LAST] = 12 + 8, + [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE] = 12 + 8 + 4, + [IB_OPCODE_RC_SEND_ONLY] = 12 + 8, + [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE] = 12 + 8 + 4, + [IB_OPCODE_RC_RDMA_WRITE_FIRST] = 12 + 8 + 16, + [IB_OPCODE_RC_RDMA_WRITE_MIDDLE] = 12 + 8, + [IB_OPCODE_RC_RDMA_WRITE_LAST] = 12 + 8, + [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4, + [IB_OPCODE_RC_RDMA_WRITE_ONLY] = 12 + 8 + 16, + [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20, + [IB_OPCODE_RC_RDMA_READ_REQUEST] = 12 + 8 + 16, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST] = 12 + 8 + 4, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE] = 12 + 8, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST] = 12 + 8 + 4, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY] = 12 + 8 + 4, + [IB_OPCODE_RC_ACKNOWLEDGE] = 12 + 8 + 4, + [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE] = 12 + 8 + 4, + [IB_OPCODE_RC_COMPARE_SWAP] = 12 + 8 + 28, + [IB_OPCODE_RC_FETCH_ADD] = 12 + 8 + 28, + /* UC */ + [IB_OPCODE_UC_SEND_FIRST] = 12 + 8, + [IB_OPCODE_UC_SEND_MIDDLE] = 12 + 8, + [IB_OPCODE_UC_SEND_LAST] = 12 + 8, + [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE] = 12 + 8 + 4, + [IB_OPCODE_UC_SEND_ONLY] = 12 + 8, + [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE] = 12 + 8 + 4, + [IB_OPCODE_UC_RDMA_WRITE_FIRST] = 12 + 8 + 16, + [IB_OPCODE_UC_RDMA_WRITE_MIDDLE] = 12 + 8, + [IB_OPCODE_UC_RDMA_WRITE_LAST] = 12 + 8, + [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4, + [IB_OPCODE_UC_RDMA_WRITE_ONLY] = 12 + 8 + 16, + [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20, + /* UD */ + [IB_OPCODE_UD_SEND_ONLY] = 12 + 8 + 8, + [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE] = 12 + 8 + 12 +}; + +static const opcode_handler opcode_handler_tbl[256] = { + /* RC */ + [IB_OPCODE_RC_SEND_FIRST] = &hfi1_rc_rcv, + [IB_OPCODE_RC_SEND_MIDDLE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_SEND_LAST] = &hfi1_rc_rcv, + [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_SEND_ONLY] = &hfi1_rc_rcv, + [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_WRITE_FIRST] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_WRITE_MIDDLE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_WRITE_LAST] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_WRITE_ONLY] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_READ_REQUEST] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY] = &hfi1_rc_rcv, + [IB_OPCODE_RC_ACKNOWLEDGE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_COMPARE_SWAP] = &hfi1_rc_rcv, + [IB_OPCODE_RC_FETCH_ADD] = &hfi1_rc_rcv, + /* UC */ + [IB_OPCODE_UC_SEND_FIRST] = &hfi1_uc_rcv, + [IB_OPCODE_UC_SEND_MIDDLE] = &hfi1_uc_rcv, + [IB_OPCODE_UC_SEND_LAST] = &hfi1_uc_rcv, + [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE] = &hfi1_uc_rcv, + [IB_OPCODE_UC_SEND_ONLY] = &hfi1_uc_rcv, + [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE] = &hfi1_uc_rcv, + [IB_OPCODE_UC_RDMA_WRITE_FIRST] = &hfi1_uc_rcv, + [IB_OPCODE_UC_RDMA_WRITE_MIDDLE] = &hfi1_uc_rcv, + [IB_OPCODE_UC_RDMA_WRITE_LAST] = &hfi1_uc_rcv, + [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_uc_rcv, + [IB_OPCODE_UC_RDMA_WRITE_ONLY] = &hfi1_uc_rcv, + [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_uc_rcv, + /* UD */ + [IB_OPCODE_UD_SEND_ONLY] = &hfi1_ud_rcv, + [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE] = &hfi1_ud_rcv, + /* CNP */ + [IB_OPCODE_CNP] = &hfi1_cnp_rcv +}; + +/* + * System image GUID. + */ +__be64 ib_hfi1_sys_image_guid; + +/** + * hfi1_copy_sge - copy data to SGE memory + * @ss: the SGE state + * @data: the data to copy + * @length: the length of the data + */ +void hfi1_copy_sge( + struct hfi1_sge_state *ss, + void *data, u32 length, + int release) +{ + struct hfi1_sge *sge = &ss->sge; + + while (length) { + u32 len = sge->length; + + if (len > length) + len = length; + if (len > sge->sge_length) + len = sge->sge_length; + WARN_ON_ONCE(len == 0); + memcpy(sge->vaddr, data, len); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (release) + hfi1_put_mr(sge->mr); + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= HFI1_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + data += len; + length -= len; + } +} + +/** + * hfi1_skip_sge - skip over SGE memory + * @ss: the SGE state + * @length: the number of bytes to skip + */ +void hfi1_skip_sge(struct hfi1_sge_state *ss, u32 length, int release) +{ + struct hfi1_sge *sge = &ss->sge; + + while (length) { + u32 len = sge->length; + + if (len > length) + len = length; + if (len > sge->sge_length) + len = sge->sge_length; + WARN_ON_ONCE(len == 0); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (release) + hfi1_put_mr(sge->mr); + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= HFI1_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + length -= len; + } +} + +/** + * post_one_send - post one RC, UC, or UD send work request + * @qp: the QP to post on + * @wr: the work request to send + */ +static int post_one_send(struct hfi1_qp *qp, struct ib_send_wr *wr) +{ + struct hfi1_swqe *wqe; + u32 next; + int i; + int j; + int acc; + struct hfi1_lkey_table *rkt; + struct hfi1_pd *pd; + struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); + struct hfi1_pportdata *ppd; + struct hfi1_ibport *ibp; + + /* IB spec says that num_sge == 0 is OK. */ + if (unlikely(wr->num_sge > qp->s_max_sge)) + return -EINVAL; + + ppd = &dd->pport[qp->port_num - 1]; + ibp = &ppd->ibport_data; + + /* + * Don't allow RDMA reads or atomic operations on UC or + * undefined operations. + * Make sure buffer is large enough to hold the result for atomics. + */ + if (wr->opcode == IB_WR_FAST_REG_MR) { + return -EINVAL; + } else if (qp->ibqp.qp_type == IB_QPT_UC) { + if ((unsigned) wr->opcode >= IB_WR_RDMA_READ) + return -EINVAL; + } else if (qp->ibqp.qp_type != IB_QPT_RC) { + /* Check IB_QPT_SMI, IB_QPT_GSI, IB_QPT_UD opcode */ + if (wr->opcode != IB_WR_SEND && + wr->opcode != IB_WR_SEND_WITH_IMM) + return -EINVAL; + /* Check UD destination address PD */ + if (qp->ibqp.pd != wr->wr.ud.ah->pd) + return -EINVAL; + } else if ((unsigned) wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD) + return -EINVAL; + else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP && + (wr->num_sge == 0 || + wr->sg_list[0].length < sizeof(u64) || + wr->sg_list[0].addr & (sizeof(u64) - 1))) + return -EINVAL; + else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic) + return -EINVAL; + + next = qp->s_head + 1; + if (next >= qp->s_size) + next = 0; + if (next == qp->s_last) + return -ENOMEM; + + rkt = &to_idev(qp->ibqp.device)->lk_table; + pd = to_ipd(qp->ibqp.pd); + wqe = get_swqe_ptr(qp, qp->s_head); + wqe->wr = *wr; + wqe->length = 0; + j = 0; + if (wr->num_sge) { + acc = wr->opcode >= IB_WR_RDMA_READ ? + IB_ACCESS_LOCAL_WRITE : 0; + for (i = 0; i < wr->num_sge; i++) { + u32 length = wr->sg_list[i].length; + int ok; + + if (length == 0) + continue; + ok = hfi1_lkey_ok(rkt, pd, &wqe->sg_list[j], + &wr->sg_list[i], acc); + if (!ok) + goto bail_inval_free; + wqe->length += length; + j++; + } + wqe->wr.num_sge = j; + } + if (qp->ibqp.qp_type == IB_QPT_UC || + qp->ibqp.qp_type == IB_QPT_RC) { + if (wqe->length > 0x80000000U) + goto bail_inval_free; + } else { + struct hfi1_ah *ah = to_iah(wr->wr.ud.ah); + + atomic_inc(&ah->refcount); + } + wqe->ssn = qp->s_ssn++; + qp->s_head = next; + + return 0; + +bail_inval_free: + /* release mr holds */ + while (j) { + struct hfi1_sge *sge = &wqe->sg_list[--j]; + + hfi1_put_mr(sge->mr); + } + return -EINVAL; +} + +/** + * post_send - post a send on a QP + * @ibqp: the QP to post the send on + * @wr: the list of work requests to post + * @bad_wr: the first bad WR is put here + * + * This may be called from interrupt context. + */ +static int post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct hfi1_qp *qp = to_iqp(ibqp); + int err = 0; + int call_send; + unsigned long flags; + unsigned nreq = 0; + + spin_lock_irqsave(&qp->s_lock, flags); + + /* Check that state is OK to post send. */ + if (unlikely(!(ib_hfi1_state_ops[qp->state] & HFI1_POST_SEND_OK))) { + spin_unlock_irqrestore(&qp->s_lock, flags); + return -EINVAL; + } + + /* sq empty and not list -> call send */ + call_send = qp->s_head == qp->s_last && !wr->next; + + for (; wr; wr = wr->next) { + err = post_one_send(qp, wr); + if (unlikely(err)) { + *bad_wr = wr; + goto bail; + } + nreq++; + } +bail: + if (nreq && !call_send) + hfi1_schedule_send(qp); + spin_unlock_irqrestore(&qp->s_lock, flags); + if (nreq && call_send) + hfi1_do_send(&qp->s_iowait.iowork); + return err; +} + +/** + * post_receive - post a receive on a QP + * @ibqp: the QP to post the receive on + * @wr: the WR to post + * @bad_wr: the first bad WR is put here + * + * This may be called from interrupt context. + */ +static int post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct hfi1_qp *qp = to_iqp(ibqp); + struct hfi1_rwq *wq = qp->r_rq.wq; + unsigned long flags; + int ret; + + /* Check that state is OK to post receive. */ + if (!(ib_hfi1_state_ops[qp->state] & HFI1_POST_RECV_OK) || !wq) { + *bad_wr = wr; + ret = -EINVAL; + goto bail; + } + + for (; wr; wr = wr->next) { + struct hfi1_rwqe *wqe; + u32 next; + int i; + + if ((unsigned) wr->num_sge > qp->r_rq.max_sge) { + *bad_wr = wr; + ret = -EINVAL; + goto bail; + } + + spin_lock_irqsave(&qp->r_rq.lock, flags); + next = wq->head + 1; + if (next >= qp->r_rq.size) + next = 0; + if (next == wq->tail) { + spin_unlock_irqrestore(&qp->r_rq.lock, flags); + *bad_wr = wr; + ret = -ENOMEM; + goto bail; + } + + wqe = get_rwqe_ptr(&qp->r_rq, wq->head); + wqe->wr_id = wr->wr_id; + wqe->num_sge = wr->num_sge; + for (i = 0; i < wr->num_sge; i++) + wqe->sg_list[i] = wr->sg_list[i]; + /* Make sure queue entry is written before the head index. */ + smp_wmb(); + wq->head = next; + spin_unlock_irqrestore(&qp->r_rq.lock, flags); + } + ret = 0; + +bail: + return ret; +} + +/* + * Make sure the QP is ready and able to accept the given opcode. + */ +static inline int qp_ok(int opcode, struct hfi1_packet *packet) +{ + struct hfi1_ibport *ibp; + + if (!(ib_hfi1_state_ops[packet->qp->state] & HFI1_PROCESS_RECV_OK)) + goto dropit; + if (((opcode & OPCODE_QP_MASK) == packet->qp->allowed_ops) || + (opcode == IB_OPCODE_CNP)) + return 1; +dropit: + ibp = &packet->rcd->ppd->ibport_data; + ibp->n_pkt_drops++; + return 0; +} + + +/** + * hfi1_ib_rcv - process an incoming packet + * @packet: data packet information + * + * This is called to process an incoming packet at interrupt level. + * + * Tlen is the length of the header + data + CRC in bytes. + */ +void hfi1_ib_rcv(struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; + struct hfi1_ib_header *hdr = packet->hdr; + u32 tlen = packet->tlen; + struct hfi1_pportdata *ppd = rcd->ppd; + struct hfi1_ibport *ibp = &ppd->ibport_data; + u32 qp_num; + int lnh; + u8 opcode; + u16 lid; + + /* Check for GRH */ + lnh = be16_to_cpu(hdr->lrh[0]) & 3; + if (lnh == HFI1_LRH_BTH) + packet->ohdr = &hdr->u.oth; + else if (lnh == HFI1_LRH_GRH) { + u32 vtf; + + packet->ohdr = &hdr->u.l.oth; + if (hdr->u.l.grh.next_hdr != IB_GRH_NEXT_HDR) + goto drop; + vtf = be32_to_cpu(hdr->u.l.grh.version_tclass_flow); + if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION) + goto drop; + packet->rcv_flags |= HFI1_HAS_GRH; + } else + goto drop; + + trace_input_ibhdr(rcd->dd, hdr); + + opcode = (be32_to_cpu(packet->ohdr->bth[0]) >> 24); + inc_opstats(tlen, &rcd->opstats->stats[opcode]); + + /* Get the destination QP number. */ + qp_num = be32_to_cpu(packet->ohdr->bth[1]) & HFI1_QPN_MASK; + lid = be16_to_cpu(hdr->lrh[1]); + if (unlikely((lid >= HFI1_MULTICAST_LID_BASE) && + (lid != HFI1_PERMISSIVE_LID))) { + struct hfi1_mcast *mcast; + struct hfi1_mcast_qp *p; + + if (lnh != HFI1_LRH_GRH) + goto drop; + mcast = hfi1_mcast_find(ibp, &hdr->u.l.grh.dgid); + if (mcast == NULL) + goto drop; + list_for_each_entry_rcu(p, &mcast->qp_list, list) { + packet->qp = p->qp; + spin_lock(&packet->qp->r_lock); + if (likely((qp_ok(opcode, packet)))) + opcode_handler_tbl[opcode](packet); + spin_unlock(&packet->qp->r_lock); + } + /* + * Notify hfi1_multicast_detach() if it is waiting for us + * to finish. + */ + if (atomic_dec_return(&mcast->refcount) <= 1) + wake_up(&mcast->wait); + } else { + rcu_read_lock(); + packet->qp = hfi1_lookup_qpn(ibp, qp_num); + if (!packet->qp) { + rcu_read_unlock(); + goto drop; + } + spin_lock(&packet->qp->r_lock); + if (likely((qp_ok(opcode, packet)))) + opcode_handler_tbl[opcode](packet); + spin_unlock(&packet->qp->r_lock); + rcu_read_unlock(); + } + return; + +drop: + ibp->n_pkt_drops++; +} + +/* + * This is called from a timer to check for QPs + * which need kernel memory in order to send a packet. + */ +static void mem_timer(unsigned long data) +{ + struct hfi1_ibdev *dev = (struct hfi1_ibdev *)data; + struct list_head *list = &dev->memwait; + struct hfi1_qp *qp = NULL; + struct iowait *wait; + unsigned long flags; + + write_seqlock_irqsave(&dev->iowait_lock, flags); + if (!list_empty(list)) { + wait = list_first_entry(list, struct iowait, list); + qp = container_of(wait, struct hfi1_qp, s_iowait); + list_del_init(&qp->s_iowait.list); + /* refcount held until actual wake up */ + if (!list_empty(list)) + mod_timer(&dev->mem_timer, jiffies + 1); + } + write_sequnlock_irqrestore(&dev->iowait_lock, flags); + + if (qp) + hfi1_qp_wakeup(qp, HFI1_S_WAIT_KMEM); +} + +void update_sge(struct hfi1_sge_state *ss, u32 length) +{ + struct hfi1_sge *sge = &ss->sge; + + sge->vaddr += length; + sge->length -= length; + sge->sge_length -= length; + if (sge->sge_length == 0) { + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= HFI1_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + return; + sge->n = 0; + } + sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = sge->mr->map[sge->m]->segs[sge->n].length; + } +} + +static noinline struct verbs_txreq *__get_txreq(struct hfi1_ibdev *dev, + struct hfi1_qp *qp) +{ + struct verbs_txreq *tx; + unsigned long flags; + + tx = kmem_cache_alloc(dev->verbs_txreq_cache, GFP_ATOMIC); + if (!tx) { + spin_lock_irqsave(&qp->s_lock, flags); + write_seqlock(&dev->iowait_lock); + if (ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK && + list_empty(&qp->s_iowait.list)) { + dev->n_txwait++; + qp->s_flags |= HFI1_S_WAIT_TX; + list_add_tail(&qp->s_iowait.list, &dev->txwait); + trace_hfi1_qpsleep(qp, HFI1_S_WAIT_TX); + atomic_inc(&qp->refcount); + } + qp->s_flags &= ~HFI1_S_BUSY; + write_sequnlock(&dev->iowait_lock); + spin_unlock_irqrestore(&qp->s_lock, flags); + tx = ERR_PTR(-EBUSY); + } + return tx; +} + +static inline struct verbs_txreq *get_txreq(struct hfi1_ibdev *dev, + struct hfi1_qp *qp) +{ + struct verbs_txreq *tx; + + tx = kmem_cache_alloc(dev->verbs_txreq_cache, GFP_ATOMIC); + if (!tx) + /* call slow path to get the lock */ + tx = __get_txreq(dev, qp); + if (tx) + tx->qp = qp; + return tx; +} + +void hfi1_put_txreq(struct verbs_txreq *tx) +{ + struct hfi1_ibdev *dev; + struct hfi1_qp *qp; + unsigned long flags; + unsigned int seq; + + qp = tx->qp; + dev = to_idev(qp->ibqp.device); + + if (tx->mr) { + hfi1_put_mr(tx->mr); + tx->mr = NULL; + } + sdma_txclean(dd_from_dev(dev), &tx->txreq); + + /* Free verbs_txreq and return to slab cache */ + kmem_cache_free(dev->verbs_txreq_cache, tx); + + do { + seq = read_seqbegin(&dev->iowait_lock); + if (!list_empty(&dev->txwait)) { + struct iowait *wait; + + write_seqlock_irqsave(&dev->iowait_lock, flags); + /* Wake up first QP wanting a free struct */ + wait = list_first_entry(&dev->txwait, struct iowait, + list); + qp = container_of(wait, struct hfi1_qp, s_iowait); + list_del_init(&qp->s_iowait.list); + /* refcount held until actual wake up */ + write_sequnlock_irqrestore(&dev->iowait_lock, flags); + hfi1_qp_wakeup(qp, HFI1_S_WAIT_TX); + break; + } + } while (read_seqretry(&dev->iowait_lock, seq)); +} + +/* + * This is called with progress side lock held. + */ +/* New API */ +static void verbs_sdma_complete( + struct sdma_txreq *cookie, + int status, + int drained) +{ + struct verbs_txreq *tx = + container_of(cookie, struct verbs_txreq, txreq); + struct hfi1_qp *qp = tx->qp; + + spin_lock(&qp->s_lock); + if (tx->wqe) + hfi1_send_complete(qp, tx->wqe, IB_WC_SUCCESS); + else if (qp->ibqp.qp_type == IB_QPT_RC) { + struct hfi1_ib_header *hdr; + + hdr = &tx->phdr.hdr; + hfi1_rc_send_complete(qp, hdr); + } + if (drained) { + /* + * This happens when the send engine notes + * a QP in the error state and cannot + * do the flush work until that QP's + * sdma work has finished. + */ + if (qp->s_flags & HFI1_S_WAIT_DMA) { + qp->s_flags &= ~HFI1_S_WAIT_DMA; + hfi1_schedule_send(qp); + } + } + spin_unlock(&qp->s_lock); + + hfi1_put_txreq(tx); +} + +static int wait_kmem(struct hfi1_ibdev *dev, struct hfi1_qp *qp) +{ + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK) { + write_seqlock(&dev->iowait_lock); + if (list_empty(&qp->s_iowait.list)) { + if (list_empty(&dev->memwait)) + mod_timer(&dev->mem_timer, jiffies + 1); + qp->s_flags |= HFI1_S_WAIT_KMEM; + list_add_tail(&qp->s_iowait.list, &dev->memwait); + trace_hfi1_qpsleep(qp, HFI1_S_WAIT_KMEM); + atomic_inc(&qp->refcount); + } + write_sequnlock(&dev->iowait_lock); + qp->s_flags &= ~HFI1_S_BUSY; + ret = -EBUSY; + } + spin_unlock_irqrestore(&qp->s_lock, flags); + + return ret; +} + +/* + * This routine calls txadds for each sg entry. + * + * Add failures will revert the sge cursor + */ +static int build_verbs_ulp_payload( + struct sdma_engine *sde, + struct hfi1_sge_state *ss, + u32 length, + struct verbs_txreq *tx) +{ + struct hfi1_sge *sg_list = ss->sg_list; + struct hfi1_sge sge = ss->sge; + u8 num_sge = ss->num_sge; + u32 len; + int ret = 0; + + while (length) { + len = ss->sge.length; + if (len > length) + len = length; + if (len > ss->sge.sge_length) + len = ss->sge.sge_length; + WARN_ON_ONCE(len == 0); + ret = sdma_txadd_kvaddr( + sde->dd, + &tx->txreq, + ss->sge.vaddr, + len); + if (ret) + goto bail_txadd; + update_sge(ss, len); + length -= len; + } + return ret; +bail_txadd: + /* unwind cursor */ + ss->sge = sge; + ss->num_sge = num_sge; + ss->sg_list = sg_list; + return ret; +} + +/* + * Build the number of DMA descriptors needed to send length bytes of data. + * + * NOTE: DMA mapping is held in the tx until completed in the ring or + * the tx desc is freed without having been submitted to the ring + * + * This routine insures the following all the helper routine + * calls succeed. + */ +/* New API */ +static int build_verbs_tx_desc( + struct sdma_engine *sde, + struct hfi1_sge_state *ss, + u32 length, + struct verbs_txreq *tx, + struct ahg_ib_header *ahdr, + u64 pbc) +{ + int ret = 0; + struct hfi1_pio_header *phdr; + u16 hdrbytes = tx->hdr_dwords << 2; + + phdr = &tx->phdr; + if (!ahdr->ahgcount) { + ret = sdma_txinit_ahg( + &tx->txreq, + ahdr->tx_flags, + hdrbytes + length, + ahdr->ahgidx, + 0, + NULL, + 0, + verbs_sdma_complete); + if (ret) + goto bail_txadd; + phdr->pbc = cpu_to_le64(pbc); + memcpy(&phdr->hdr, &ahdr->ibh, hdrbytes - sizeof(phdr->pbc)); + /* add the header */ + ret = sdma_txadd_kvaddr( + sde->dd, + &tx->txreq, + &tx->phdr, + tx->hdr_dwords << 2); + if (ret) + goto bail_txadd; + } else { + struct hfi1_other_headers *sohdr = &ahdr->ibh.u.oth; + struct hfi1_other_headers *dohdr = &phdr->hdr.u.oth; + + /* needed in rc_send_complete() */ + phdr->hdr.lrh[0] = ahdr->ibh.lrh[0]; + if ((be16_to_cpu(phdr->hdr.lrh[0]) & 3) == HFI1_LRH_GRH) { + sohdr = &ahdr->ibh.u.l.oth; + dohdr = &phdr->hdr.u.l.oth; + } + /* opcode */ + dohdr->bth[0] = sohdr->bth[0]; + /* PSN/ACK */ + dohdr->bth[2] = sohdr->bth[2]; + ret = sdma_txinit_ahg( + &tx->txreq, + ahdr->tx_flags, + length, + ahdr->ahgidx, + ahdr->ahgcount, + ahdr->ahgdesc, + hdrbytes, + verbs_sdma_complete); + if (ret) + goto bail_txadd; + } + + /* add the ulp payload - if any. ss can be NULL for acks */ + if (ss) + ret = build_verbs_ulp_payload(sde, ss, length, tx); +bail_txadd: + return ret; +} + +int hfi1_verbs_send_dma(struct hfi1_qp *qp, struct ahg_ib_header *ahdr, + u32 hdrwords, struct hfi1_sge_state *ss, u32 len, + u32 plen, u32 dwords, u64 pbc) +{ + struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); + struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct verbs_txreq *tx; + struct sdma_txreq *stx; + u64 pbc_flags = 0; + struct sdma_engine *sde; + u8 sc5 = qp->s_sc; + int ret; + + if (!list_empty(&qp->s_iowait.tx_head)) { + stx = list_first_entry( + &qp->s_iowait.tx_head, + struct sdma_txreq, + list); + list_del_init(&stx->list); + tx = container_of(stx, struct verbs_txreq, txreq); + ret = sdma_send_txreq(tx->sde, &qp->s_iowait, stx); + if (unlikely(ret == -ECOMM)) + goto bail_ecomm; + return ret; + } + + tx = get_txreq(dev, qp); + if (IS_ERR(tx)) + goto bail_tx; + + if (!qp->s_hdr->sde) { + tx->sde = sde = qp_to_sdma_engine(qp, sc5); + if (!sde) + goto bail_no_sde; + } else + tx->sde = sde = qp->s_hdr->sde; + + if (likely(pbc == 0)) { + u32 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5); + /* No vl15 here */ + /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */ + pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT; + + pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen); + } + tx->wqe = qp->s_wqe; + tx->mr = qp->s_rdma_mr; + if (qp->s_rdma_mr) + qp->s_rdma_mr = NULL; + tx->hdr_dwords = hdrwords + 2; + ret = build_verbs_tx_desc(sde, ss, len, tx, ahdr, pbc); + if (unlikely(ret)) + goto bail_build; + trace_output_ibhdr(dd_from_ibdev(qp->ibqp.device), &ahdr->ibh); + ret = sdma_send_txreq(sde, &qp->s_iowait, &tx->txreq); + if (unlikely(ret == -ECOMM)) + goto bail_ecomm; + return ret; + +bail_no_sde: + hfi1_put_txreq(tx); +bail_ecomm: + /* The current one got "sent" */ + return 0; +bail_build: + /* kmalloc or mapping fail */ + hfi1_put_txreq(tx); + return wait_kmem(dev, qp); +bail_tx: + return PTR_ERR(tx); +} + +/* + * If we are now in the error state, return zero to flush the + * send work request. + */ +static int no_bufs_available(struct hfi1_qp *qp, struct send_context *sc) +{ + struct hfi1_devdata *dd = sc->dd; + struct hfi1_ibdev *dev = &dd->verbs_dev; + unsigned long flags; + int ret = 0; + + /* + * Note that as soon as want_buffer() is called and + * possibly before it returns, sc_piobufavail() + * could be called. Therefore, put QP on the I/O wait list before + * enabling the PIO avail interrupt. + */ + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK) { + write_seqlock(&dev->iowait_lock); + if (list_empty(&qp->s_iowait.list)) { + struct hfi1_ibdev *dev = &dd->verbs_dev; + int was_empty; + + dev->n_piowait++; + qp->s_flags |= HFI1_S_WAIT_PIO; + was_empty = list_empty(&sc->piowait); + list_add_tail(&qp->s_iowait.list, &sc->piowait); + trace_hfi1_qpsleep(qp, HFI1_S_WAIT_PIO); + atomic_inc(&qp->refcount); + /* counting: only call wantpiobuf_intr if first user */ + if (was_empty) + hfi1_sc_wantpiobuf_intr(sc, 1); + } + write_sequnlock(&dev->iowait_lock); + qp->s_flags &= ~HFI1_S_BUSY; + ret = -EBUSY; + } + spin_unlock_irqrestore(&qp->s_lock, flags); + return ret; +} + +struct send_context *qp_to_send_context(struct hfi1_qp *qp, u8 sc5) +{ + struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); + struct hfi1_pportdata *ppd = dd->pport + (qp->port_num - 1); + u8 vl; + + vl = sc_to_vlt(dd, sc5); + if (vl >= ppd->vls_supported && vl != 15) + return NULL; + return dd->vld[vl].sc; +} + +int hfi1_verbs_send_pio(struct hfi1_qp *qp, struct ahg_ib_header *ahdr, + u32 hdrwords, struct hfi1_sge_state *ss, u32 len, + u32 plen, u32 dwords, u64 pbc) +{ + struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u32 *hdr = (u32 *)&ahdr->ibh; + u64 pbc_flags = 0; + u32 sc5; + unsigned long flags = 0; + struct send_context *sc; + struct pio_buf *pbuf; + int wc_status = IB_WC_SUCCESS; + + /* vl15 special case taken care of in ud.c */ + sc5 = qp->s_sc; + sc = qp_to_send_context(qp, sc5); + + if (!sc) + return -EINVAL; + if (likely(pbc == 0)) { + u32 vl = sc_to_vlt(dd_from_ibdev(qp->ib |