summaryrefslogtreecommitdiffstats
path: root/drivers/staging/rdma/hfi1/chip.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/staging/rdma/hfi1/chip.c')
-rw-r--r--drivers/staging/rdma/hfi1/chip.c647
1 files changed, 461 insertions, 186 deletions
diff --git a/drivers/staging/rdma/hfi1/chip.c b/drivers/staging/rdma/hfi1/chip.c
index 16eb653903e0..dcae8e723f98 100644
--- a/drivers/staging/rdma/hfi1/chip.c
+++ b/drivers/staging/rdma/hfi1/chip.c
@@ -123,6 +123,8 @@ struct flag_table {
#define MIN_KERNEL_KCTXTS 2
#define FIRST_KERNEL_KCTXT 1
+/* sizes for both the QP and RSM map tables */
+#define NUM_MAP_ENTRIES 256
#define NUM_MAP_REGS 32
/* Bit offset into the GUID which carries HFI id information */
@@ -1029,9 +1031,12 @@ static int thermal_init(struct hfi1_devdata *dd);
static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state,
int msecs);
static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc);
+static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr);
static void handle_temp_err(struct hfi1_devdata *);
static void dc_shutdown(struct hfi1_devdata *);
static void dc_start(struct hfi1_devdata *);
+static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp,
+ unsigned int *np);
/*
* Error interrupt table entry. This is used as input to the interrupt
@@ -5661,7 +5666,7 @@ static int sc_to_vl(struct hfi1_devdata *dd, int sw_index)
sci = &dd->send_contexts[sw_index];
/* there is no information for user (PSM) and ack contexts */
- if (sci->type != SC_KERNEL)
+ if ((sci->type != SC_KERNEL) && (sci->type != SC_VL15))
return -1;
sc = sci->sc;
@@ -6199,18 +6204,13 @@ static void hreq_response(struct hfi1_devdata *dd, u8 return_code, u16 rsp_data)
/*
* Handle host requests from the 8051.
- *
- * This is a work-queue function outside of the interrupt.
*/
-void handle_8051_request(struct work_struct *work)
+static void handle_8051_request(struct hfi1_pportdata *ppd)
{
- struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
- dc_host_req_work);
struct hfi1_devdata *dd = ppd->dd;
u64 reg;
u16 data = 0;
- u8 type, i, lanes, *cache = ppd->qsfp_info.cache;
- u8 cdr_ctrl_byte = cache[QSFP_CDR_CTRL_BYTE_OFFS];
+ u8 type;
reg = read_csr(dd, DC_DC8051_CFG_EXT_DEV_1);
if ((reg & DC_DC8051_CFG_EXT_DEV_1_REQ_NEW_SMASK) == 0)
@@ -6231,46 +6231,11 @@ void handle_8051_request(struct work_struct *work)
case HREQ_READ_CONFIG:
case HREQ_SET_TX_EQ_ABS:
case HREQ_SET_TX_EQ_REL:
+ case HREQ_ENABLE:
dd_dev_info(dd, "8051 request: request 0x%x not supported\n",
type);
hreq_response(dd, HREQ_NOT_SUPPORTED, 0);
break;
-
- case HREQ_ENABLE:
- lanes = data & 0xF;
- for (i = 0; lanes; lanes >>= 1, i++) {
- if (!(lanes & 1))
- continue;
- if (data & 0x200) {
- /* enable TX CDR */
- if (cache[QSFP_MOD_PWR_OFFS] & 0x8 &&
- cache[QSFP_CDR_INFO_OFFS] & 0x80)
- cdr_ctrl_byte |= (1 << (i + 4));
- } else {
- /* disable TX CDR */
- if (cache[QSFP_MOD_PWR_OFFS] & 0x8 &&
- cache[QSFP_CDR_INFO_OFFS] & 0x80)
- cdr_ctrl_byte &= ~(1 << (i + 4));
- }
-
- if (data & 0x800) {
- /* enable RX CDR */
- if (cache[QSFP_MOD_PWR_OFFS] & 0x4 &&
- cache[QSFP_CDR_INFO_OFFS] & 0x40)
- cdr_ctrl_byte |= (1 << i);
- } else {
- /* disable RX CDR */
- if (cache[QSFP_MOD_PWR_OFFS] & 0x4 &&
- cache[QSFP_CDR_INFO_OFFS] & 0x40)
- cdr_ctrl_byte &= ~(1 << i);
- }
- }
- one_qsfp_write(ppd, dd->hfi1_id, QSFP_CDR_CTRL_BYTE_OFFS,
- &cdr_ctrl_byte, 1);
- hreq_response(dd, HREQ_SUCCESS, data);
- refresh_qsfp_cache(ppd, &ppd->qsfp_info);
- break;
-
case HREQ_CONFIG_DONE:
hreq_response(dd, HREQ_SUCCESS, 0);
break;
@@ -6278,7 +6243,6 @@ void handle_8051_request(struct work_struct *work)
case HREQ_INTERFACE_TEST:
hreq_response(dd, HREQ_SUCCESS, data);
break;
-
default:
dd_dev_err(dd, "8051 request: unknown request 0x%x\n", type);
hreq_response(dd, HREQ_NOT_SUPPORTED, 0);
@@ -6849,6 +6813,75 @@ static void reset_neighbor_info(struct hfi1_pportdata *ppd)
ppd->neighbor_fm_security = 0;
}
+static const char * const link_down_reason_strs[] = {
+ [OPA_LINKDOWN_REASON_NONE] = "None",
+ [OPA_LINKDOWN_REASON_RCV_ERROR_0] = "Recive error 0",
+ [OPA_LINKDOWN_REASON_BAD_PKT_LEN] = "Bad packet length",
+ [OPA_LINKDOWN_REASON_PKT_TOO_LONG] = "Packet too long",
+ [OPA_LINKDOWN_REASON_PKT_TOO_SHORT] = "Packet too short",
+ [OPA_LINKDOWN_REASON_BAD_SLID] = "Bad SLID",
+ [OPA_LINKDOWN_REASON_BAD_DLID] = "Bad DLID",
+ [OPA_LINKDOWN_REASON_BAD_L2] = "Bad L2",
+ [OPA_LINKDOWN_REASON_BAD_SC] = "Bad SC",
+ [OPA_LINKDOWN_REASON_RCV_ERROR_8] = "Receive error 8",
+ [OPA_LINKDOWN_REASON_BAD_MID_TAIL] = "Bad mid tail",
+ [OPA_LINKDOWN_REASON_RCV_ERROR_10] = "Receive error 10",
+ [OPA_LINKDOWN_REASON_PREEMPT_ERROR] = "Preempt error",
+ [OPA_LINKDOWN_REASON_PREEMPT_VL15] = "Preempt vl15",
+ [OPA_LINKDOWN_REASON_BAD_VL_MARKER] = "Bad VL marker",
+ [OPA_LINKDOWN_REASON_RCV_ERROR_14] = "Receive error 14",
+ [OPA_LINKDOWN_REASON_RCV_ERROR_15] = "Receive error 15",
+ [OPA_LINKDOWN_REASON_BAD_HEAD_DIST] = "Bad head distance",
+ [OPA_LINKDOWN_REASON_BAD_TAIL_DIST] = "Bad tail distance",
+ [OPA_LINKDOWN_REASON_BAD_CTRL_DIST] = "Bad control distance",
+ [OPA_LINKDOWN_REASON_BAD_CREDIT_ACK] = "Bad credit ack",
+ [OPA_LINKDOWN_REASON_UNSUPPORTED_VL_MARKER] = "Unsupported VL marker",
+ [OPA_LINKDOWN_REASON_BAD_PREEMPT] = "Bad preempt",
+ [OPA_LINKDOWN_REASON_BAD_CONTROL_FLIT] = "Bad control flit",
+ [OPA_LINKDOWN_REASON_EXCEED_MULTICAST_LIMIT] = "Exceed multicast limit",
+ [OPA_LINKDOWN_REASON_RCV_ERROR_24] = "Receive error 24",
+ [OPA_LINKDOWN_REASON_RCV_ERROR_25] = "Receive error 25",
+ [OPA_LINKDOWN_REASON_RCV_ERROR_26] = "Receive error 26",
+ [OPA_LINKDOWN_REASON_RCV_ERROR_27] = "Receive error 27",
+ [OPA_LINKDOWN_REASON_RCV_ERROR_28] = "Receive error 28",
+ [OPA_LINKDOWN_REASON_RCV_ERROR_29] = "Receive error 29",
+ [OPA_LINKDOWN_REASON_RCV_ERROR_30] = "Receive error 30",
+ [OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN] =
+ "Excessive buffer overrun",
+ [OPA_LINKDOWN_REASON_UNKNOWN] = "Unknown",
+ [OPA_LINKDOWN_REASON_REBOOT] = "Reboot",
+ [OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN] = "Neighbor unknown",
+ [OPA_LINKDOWN_REASON_FM_BOUNCE] = "FM bounce",
+ [OPA_LINKDOWN_REASON_SPEED_POLICY] = "Speed policy",
+ [OPA_LINKDOWN_REASON_WIDTH_POLICY] = "Width policy",
+ [OPA_LINKDOWN_REASON_DISCONNECTED] = "Disconnected",
+ [OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED] =
+ "Local media not installed",
+ [OPA_LINKDOWN_REASON_NOT_INSTALLED] = "Not installed",
+ [OPA_LINKDOWN_REASON_CHASSIS_CONFIG] = "Chassis config",
+ [OPA_LINKDOWN_REASON_END_TO_END_NOT_INSTALLED] =
+ "End to end not installed",
+ [OPA_LINKDOWN_REASON_POWER_POLICY] = "Power policy",
+ [OPA_LINKDOWN_REASON_LINKSPEED_POLICY] = "Link speed policy",
+ [OPA_LINKDOWN_REASON_LINKWIDTH_POLICY] = "Link width policy",
+ [OPA_LINKDOWN_REASON_SWITCH_MGMT] = "Switch management",
+ [OPA_LINKDOWN_REASON_SMA_DISABLED] = "SMA disabled",
+ [OPA_LINKDOWN_REASON_TRANSIENT] = "Transient"
+};
+
+/* return the neighbor link down reason string */
+static const char *link_down_reason_str(u8 reason)
+{
+ const char *str = NULL;
+
+ if (reason < ARRAY_SIZE(link_down_reason_strs))
+ str = link_down_reason_strs[reason];
+ if (!str)
+ str = "(invalid)";
+
+ return str;
+}
+
/*
* Handle a link down interrupt from the 8051.
*
@@ -6857,8 +6890,11 @@ static void reset_neighbor_info(struct hfi1_pportdata *ppd)
void handle_link_down(struct work_struct *work)
{
u8 lcl_reason, neigh_reason = 0;
+ u8 link_down_reason;
struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
- link_down_work);
+ link_down_work);
+ int was_up;
+ static const char ldr_str[] = "Link down reason: ";
if ((ppd->host_link_state &
(HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) &&
@@ -6867,20 +6903,63 @@ void handle_link_down(struct work_struct *work)
HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NOT_INSTALLED);
/* Go offline first, then deal with reading/writing through 8051 */
+ was_up = !!(ppd->host_link_state & HLS_UP);
set_link_state(ppd, HLS_DN_OFFLINE);
- lcl_reason = 0;
- read_planned_down_reason_code(ppd->dd, &neigh_reason);
+ if (was_up) {
+ lcl_reason = 0;
+ /* link down reason is only valid if the link was up */
+ read_link_down_reason(ppd->dd, &link_down_reason);
+ switch (link_down_reason) {
+ case LDR_LINK_TRANSFER_ACTIVE_LOW:
+ /* the link went down, no idle message reason */
+ dd_dev_info(ppd->dd, "%sUnexpected link down\n",
+ ldr_str);
+ break;
+ case LDR_RECEIVED_LINKDOWN_IDLE_MSG:
+ /*
+ * The neighbor reason is only valid if an idle message
+ * was received for it.
+ */
+ read_planned_down_reason_code(ppd->dd, &neigh_reason);
+ dd_dev_info(ppd->dd,
+ "%sNeighbor link down message %d, %s\n",
+ ldr_str, neigh_reason,
+ link_down_reason_str(neigh_reason));
+ break;
+ case LDR_RECEIVED_HOST_OFFLINE_REQ:
+ dd_dev_info(ppd->dd,
+ "%sHost requested link to go offline\n",
+ ldr_str);
+ break;
+ default:
+ dd_dev_info(ppd->dd, "%sUnknown reason 0x%x\n",
+ ldr_str, link_down_reason);
+ break;
+ }
- /*
- * If no reason, assume peer-initiated but missed
- * LinkGoingDown idle flits.
- */
- if (neigh_reason == 0)
- lcl_reason = OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN;
+ /*
+ * If no reason, assume peer-initiated but missed
+ * LinkGoingDown idle flits.
+ */
+ if (neigh_reason == 0)
+ lcl_reason = OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN;
+ } else {
+ /* went down while polling or going up */
+ lcl_reason = OPA_LINKDOWN_REASON_TRANSIENT;
+ }
set_link_down_reason(ppd, lcl_reason, neigh_reason, 0);
+ /* inform the SMA when the link transitions from up to down */
+ if (was_up && ppd->local_link_down_reason.sma == 0 &&
+ ppd->neigh_link_down_reason.sma == 0) {
+ ppd->local_link_down_reason.sma =
+ ppd->local_link_down_reason.latest;
+ ppd->neigh_link_down_reason.sma =
+ ppd->neigh_link_down_reason.latest;
+ }
+
reset_neighbor_info(ppd);
/* disable the port */
@@ -6890,7 +6969,7 @@ void handle_link_down(struct work_struct *work)
* If there is no cable attached, turn the DC off. Otherwise,
* start the link bring up.
*/
- if (!qsfp_mod_present(ppd)) {
+ if (ppd->port_type == PORT_TYPE_QSFP && !qsfp_mod_present(ppd)) {
dc_shutdown(ppd->dd);
} else {
tune_serdes(ppd);
@@ -7373,7 +7452,11 @@ retry:
ppd->link_width_downgrade_rx_active = rx;
}
- if (lwde == 0) {
+ if (ppd->link_width_downgrade_tx_active == 0 ||
+ ppd->link_width_downgrade_rx_active == 0) {
+ /* the 8051 reported a dead link as a downgrade */
+ dd_dev_err(ppd->dd, "Link downgrade is really a link down, ignoring\n");
+ } else if (lwde == 0) {
/* downgrade is disabled */
/* bounce if not at starting active width */
@@ -7534,7 +7617,7 @@ static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg)
host_msg &= ~(u64)LINKUP_ACHIEVED;
}
if (host_msg & EXT_DEVICE_CFG_REQ) {
- queue_work(ppd->hfi1_wq, &ppd->dc_host_req_work);
+ handle_8051_request(ppd);
host_msg &= ~(u64)EXT_DEVICE_CFG_REQ;
}
if (host_msg & VERIFY_CAP_FRAME) {
@@ -8660,6 +8743,14 @@ static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc)
*pdrrc = (frame >> DOWN_REMOTE_REASON_SHIFT) & DOWN_REMOTE_REASON_MASK;
}
+static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr)
+{
+ u32 frame;
+
+ read_8051_config(dd, LINK_DOWN_REASON, GENERAL_CONFIG, &frame);
+ *ldr = (frame & 0xff);
+}
+
static int read_tx_settings(struct hfi1_devdata *dd,
u8 *enable_lane_tx,
u8 *tx_polarity_inversion,
@@ -9049,9 +9140,9 @@ set_local_link_attributes_fail:
}
/*
- * Call this to start the link. Schedule a retry if the cable is not
- * present or if unable to start polling. Do not do anything if the
- * link is disabled. Returns 0 if link is disabled or moved to polling
+ * Call this to start the link.
+ * Do not do anything if the link is disabled.
+ * Returns 0 if link is disabled, moved to polling, or the driver is not ready.
*/
int start_link(struct hfi1_pportdata *ppd)
{
@@ -9068,15 +9159,7 @@ int start_link(struct hfi1_pportdata *ppd)
return 0;
}
- if (qsfp_mod_present(ppd) || loopback == LOOPBACK_SERDES ||
- loopback == LOOPBACK_LCB ||
- ppd->dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
- return set_link_state(ppd, HLS_DN_POLL);
-
- dd_dev_info(ppd->dd,
- "%s: stopping link start because no cable is present\n",
- __func__);
- return -EAGAIN;
+ return set_link_state(ppd, HLS_DN_POLL);
}
static void wait_for_qsfp_init(struct hfi1_pportdata *ppd)
@@ -9247,7 +9330,7 @@ static int handle_qsfp_error_conditions(struct hfi1_pportdata *ppd,
return 0;
}
-/* This routine will only be scheduled if the QSFP module is present */
+/* This routine will only be scheduled if the QSFP module present is asserted */
void qsfp_event(struct work_struct *work)
{
struct qsfp_data *qd;
@@ -9676,6 +9759,7 @@ static void set_send_length(struct hfi1_pportdata *ppd)
& SEND_LEN_CHECK1_LEN_VL15_MASK) <<
SEND_LEN_CHECK1_LEN_VL15_SHIFT;
int i;
+ u32 thres;
for (i = 0; i < ppd->vls_supported; i++) {
if (dd->vld[i].mtu > maxvlmtu)
@@ -9694,16 +9778,17 @@ static void set_send_length(struct hfi1_pportdata *ppd)
/* adjust kernel credit return thresholds based on new MTUs */
/* all kernel receive contexts have the same hdrqentsize */
for (i = 0; i < ppd->vls_supported; i++) {
- sc_set_cr_threshold(dd->vld[i].sc,
- sc_mtu_to_threshold(dd->vld[i].sc,
- dd->vld[i].mtu,
- dd->rcd[0]->
- rcvhdrqentsize));
- }
- sc_set_cr_threshold(dd->vld[15].sc,
- sc_mtu_to_threshold(dd->vld[15].sc,
- dd->vld[15].mtu,
+ thres = min(sc_percent_to_threshold(dd->vld[i].sc, 50),
+ sc_mtu_to_threshold(dd->vld[i].sc,
+ dd->vld[i].mtu,
dd->rcd[0]->rcvhdrqentsize));
+ sc_set_cr_threshold(dd->vld[i].sc, thres);
+ }
+ thres = min(sc_percent_to_threshold(dd->vld[15].sc, 50),
+ sc_mtu_to_threshold(dd->vld[15].sc,
+ dd->vld[15].mtu,
+ dd->rcd[0]->rcvhdrqentsize));
+ sc_set_cr_threshold(dd->vld[15].sc, thres);
/* Adjust maximum MTU for the port in DC */
dcmtu = maxvlmtu == 10240 ? DCC_CFG_PORT_MTU_CAP_10240 :
@@ -10030,7 +10115,6 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state)
struct hfi1_devdata *dd = ppd->dd;
struct ib_event event = {.device = NULL};
int ret1, ret = 0;
- int was_up, is_down;
int orig_new_state, poll_bounce;
mutex_lock(&ppd->hls_lock);
@@ -10049,8 +10133,6 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state)
poll_bounce ? "(bounce) " : "",
link_state_reason_name(ppd, state));
- was_up = !!(ppd->host_link_state & HLS_UP);
-
/*
* If we're going to a (HLS_*) link state that implies the logical
* link state is neither of (IB_PORT_ARMED, IB_PORT_ACTIVE), then
@@ -10261,17 +10343,6 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state)
break;
}
- is_down = !!(ppd->host_link_state & (HLS_DN_POLL |
- HLS_DN_DISABLE | HLS_DN_OFFLINE));
-
- if (was_up && is_down && ppd->local_link_down_reason.sma == 0 &&
- ppd->neigh_link_down_reason.sma == 0) {
- ppd->local_link_down_reason.sma =
- ppd->local_link_down_reason.latest;
- ppd->neigh_link_down_reason.sma =
- ppd->neigh_link_down_reason.latest;
- }
-
goto done;
unexpected:
@@ -12673,22 +12744,24 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
int total_contexts;
int ret;
unsigned ngroups;
+ int qos_rmt_count;
+ int user_rmt_reduced;
/*
- * Kernel contexts: (to be fixed later):
- * - min or 2 or 1 context/numa
+ * Kernel receive contexts:
+ * - min of 2 or 1 context/numa (excluding control context)
* - Context 0 - control context (VL15/multicast/error)
- * - Context 1 - default context
+ * - Context 1 - first kernel context
+ * - Context 2 - second kernel context
+ * ...
*/
if (n_krcvqs)
/*
- * Don't count context 0 in n_krcvqs since
- * is isn't used for normal verbs traffic.
- *
- * krcvqs will reflect number of kernel
- * receive contexts above 0.
+ * n_krcvqs is the sum of module parameter kernel receive
+ * contexts, krcvqs[]. It does not include the control
+ * context, so add that.
*/
- num_kernel_contexts = n_krcvqs + MIN_KERNEL_KCTXTS - 1;
+ num_kernel_contexts = n_krcvqs + 1;
else
num_kernel_contexts = num_online_nodes() + 1;
num_kernel_contexts =
@@ -12705,12 +12778,13 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
num_kernel_contexts = dd->chip_send_contexts - num_vls - 1;
}
/*
- * User contexts: (to be fixed later)
- * - default to 1 user context per CPU if num_user_contexts is
- * negative
+ * User contexts:
+ * - default to 1 user context per real (non-HT) CPU core if
+ * num_user_contexts is negative
*/
if (num_user_contexts < 0)
- num_user_contexts = num_online_cpus();
+ num_user_contexts =
+ cpumask_weight(&dd->affinity->real_cpu_mask);
total_contexts = num_kernel_contexts + num_user_contexts;
@@ -12727,6 +12801,19 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
total_contexts = num_kernel_contexts + num_user_contexts;
}
+ /* each user context requires an entry in the RMT */
+ qos_rmt_count = qos_rmt_entries(dd, NULL, NULL);
+ if (qos_rmt_count + num_user_contexts > NUM_MAP_ENTRIES) {
+ user_rmt_reduced = NUM_MAP_ENTRIES - qos_rmt_count;
+ dd_dev_err(dd,
+ "RMT size is reducing the number of user receive contexts from %d to %d\n",
+ (int)num_user_contexts,
+ user_rmt_reduced);
+ /* recalculate */
+ num_user_contexts = user_rmt_reduced;
+ total_contexts = num_kernel_contexts + num_user_contexts;
+ }
+
/* the first N are kernel contexts, the rest are user contexts */
dd->num_rcv_contexts = total_contexts;
dd->n_krcv_queues = num_kernel_contexts;
@@ -12776,12 +12863,13 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
dd->num_send_contexts = ret;
dd_dev_info(
dd,
- "send contexts: chip %d, used %d (kernel %d, ack %d, user %d)\n",
+ "send contexts: chip %d, used %d (kernel %d, ack %d, user %d, vl15 %d)\n",
dd->chip_send_contexts,
dd->num_send_contexts,
dd->sc_sizes[SC_KERNEL].count,
dd->sc_sizes[SC_ACK].count,
- dd->sc_sizes[SC_USER].count);
+ dd->sc_sizes[SC_USER].count,
+ dd->sc_sizes[SC_VL15].count);
ret = 0; /* success */
}
@@ -13451,122 +13539,224 @@ static void init_qpmap_table(struct hfi1_devdata *dd,
int i;
u64 ctxt = first_ctxt;
- for (i = 0; i < 256;) {
+ for (i = 0; i < 256; i++) {
reg |= ctxt << (8 * (i % 8));
- i++;
ctxt++;
if (ctxt > last_ctxt)
ctxt = first_ctxt;
- if (i % 8 == 0) {
+ if (i % 8 == 7) {
write_csr(dd, regno, reg);
reg = 0;
regno += 8;
}
}
- if (i % 8)
- write_csr(dd, regno, reg);
add_rcvctrl(dd, RCV_CTRL_RCV_QP_MAP_ENABLE_SMASK
| RCV_CTRL_RCV_BYPASS_ENABLE_SMASK);
}
-/**
- * init_qos - init RX qos
- * @dd - device data
- * @first_context
- *
- * This routine initializes Rule 0 and the
- * RSM map table to implement qos.
- *
- * If all of the limit tests succeed,
- * qos is applied based on the array
- * interpretation of krcvqs where
- * entry 0 is VL0.
- *
- * The number of vl bits (n) and the number of qpn
- * bits (m) are computed to feed both the RSM map table
- * and the single rule.
- *
+struct rsm_map_table {
+ u64 map[NUM_MAP_REGS];
+ unsigned int used;
+};
+
+struct rsm_rule_data {
+ u8 offset;
+ u8 pkt_type;
+ u32 field1_off;
+ u32 field2_off;
+ u32 index1_off;
+ u32 index1_width;
+ u32 index2_off;
+ u32 index2_width;
+ u32 mask1;
+ u32 value1;
+ u32 mask2;
+ u32 value2;
+};
+
+/*
+ * Return an initialized RMT map table for users to fill in. OK if it
+ * returns NULL, indicating no table.
*/
-static void init_qos(struct hfi1_devdata *dd, u32 first_ctxt)
+static struct rsm_map_table *alloc_rsm_map_table(struct hfi1_devdata *dd)
{
+ struct rsm_map_table *rmt;
+ u8 rxcontext = is_ax(dd) ? 0 : 0xff; /* 0 is default if a0 ver. */
+
+ rmt = kmalloc(sizeof(*rmt), GFP_KERNEL);
+ if (rmt) {
+ memset(rmt->map, rxcontext, sizeof(rmt->map));
+ rmt->used = 0;
+ }
+
+ return rmt;
+}
+
+/*
+ * Write the final RMT map table to the chip and free the table. OK if
+ * table is NULL.
+ */
+static void complete_rsm_map_table(struct hfi1_devdata *dd,
+ struct rsm_map_table *rmt)
+{
+ int i;
+
+ if (rmt) {
+ /* write table to chip */
+ for (i = 0; i < NUM_MAP_REGS; i++)
+ write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), rmt->map[i]);
+
+ /* enable RSM */
+ add_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK);
+ }
+}
+
+/*
+ * Add a receive side mapping rule.
+ */
+static void add_rsm_rule(struct hfi1_devdata *dd, u8 rule_index,
+ struct rsm_rule_data *rrd)
+{
+ write_csr(dd, RCV_RSM_CFG + (8 * rule_index),
+ (u64)rrd->offset << RCV_RSM_CFG_OFFSET_SHIFT |
+ 1ull << rule_index | /* enable bit */
+ (u64)rrd->pkt_type << RCV_RSM_CFG_PACKET_TYPE_SHIFT);
+ write_csr(dd, RCV_RSM_SELECT + (8 * rule_index),
+ (u64)rrd->field1_off << RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT |
+ (u64)rrd->field2_off << RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT |
+ (u64)rrd->index1_off << RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT |
+ (u64)rrd->index1_width << RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT |
+ (u64)rrd->index2_off << RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT |
+ (u64)rrd->index2_width << RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT);
+ write_csr(dd, RCV_RSM_MATCH + (8 * rule_index),
+ (u64)rrd->mask1 << RCV_RSM_MATCH_MASK1_SHIFT |
+ (u64)rrd->value1 << RCV_RSM_MATCH_VALUE1_SHIFT |
+ (u64)rrd->mask2 << RCV_RSM_MATCH_MASK2_SHIFT |
+ (u64)rrd->value2 << RCV_RSM_MATCH_VALUE2_SHIFT);
+}
+
+/* return the number of RSM map table entries that will be used for QOS */
+static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp,
+ unsigned int *np)
+{
+ int i;
+ unsigned int m, n;
u8 max_by_vl = 0;
- unsigned qpns_per_vl, ctxt, i, qpn, n = 1, m;
- u64 *rsmmap;
- u64 reg;
- u8 rxcontext = is_ax(dd) ? 0 : 0xff; /* 0 is default if a0 ver. */
- /* validate */
+ /* is QOS active at all? */
if (dd->n_krcv_queues <= MIN_KERNEL_KCTXTS ||
num_vls == 1 ||
krcvqsset <= 1)
- goto bail;
- for (i = 0; i < min_t(unsigned, num_vls, krcvqsset); i++)
+ goto no_qos;
+
+ /* determine bits for qpn */
+ for (i = 0; i < min_t(unsigned int, num_vls, krcvqsset); i++)
if (krcvqs[i] > max_by_vl)
max_by_vl = krcvqs[i];
if (max_by_vl > 32)
- goto bail;
- qpns_per_vl = __roundup_pow_of_two(max_by_vl);
- /* determine bits vl */
- n = ilog2(num_vls);
- /* determine bits for qpn */
- m = ilog2(qpns_per_vl);
+ goto no_qos;
+ m = ilog2(__roundup_pow_of_two(max_by_vl));
+
+ /* determine bits for vl */
+ n = ilog2(__roundup_pow_of_two(num_vls));
+
+ /* reject if too much is used */
if ((m + n) > 7)
+ goto no_qos;
+
+ if (mp)
+ *mp = m;
+ if (np)
+ *np = n;
+
+ return 1 << (m + n);
+
+no_qos:
+ if (mp)
+ *mp = 0;
+ if (np)
+ *np = 0;
+ return 0;
+}
+
+/**
+ * init_qos - init RX qos
+ * @dd - device data
+ * @rmt - RSM map table
+ *
+ * This routine initializes Rule 0 and the RSM map table to implement
+ * quality of service (qos).
+ *
+ * If all of the limit tests succeed, qos is applied based on the array
+ * interpretation of krcvqs where entry 0 is VL0.
+ *
+ * The number of vl bits (n) and the number of qpn bits (m) are computed to
+ * feed both the RSM map table and the single rule.
+ */
+static void init_qos(struct hfi1_devdata *dd, struct rsm_map_table *rmt)
+{
+ struct rsm_rule_data rrd;
+ unsigned qpns_per_vl, ctxt, i, qpn, n = 1, m;
+ unsigned int rmt_entries;
+ u64 reg;
+
+ if (!rmt)
goto bail;
- if (num_vls * qpns_per_vl > dd->chip_rcv_contexts)
+ rmt_entries = qos_rmt_entries(dd, &m, &n);
+ if (rmt_entries == 0)
goto bail;
- rsmmap = kmalloc_array(NUM_MAP_REGS, sizeof(u64), GFP_KERNEL);
- if (!rsmmap)
+ qpns_per_vl = 1 << m;
+
+ /* enough room in the map table? */
+ rmt_entries = 1 << (m + n);
+ if (rmt->used + rmt_entries >= NUM_MAP_ENTRIES)
goto bail;
- memset(rsmmap, rxcontext, NUM_MAP_REGS * sizeof(u64));
- /* init the local copy of the table */
- for (i = 0, ctxt = first_ctxt; i < num_vls; i++) {
+
+ /* add qos entries to the the RSM map table */
+ for (i = 0, ctxt = FIRST_KERNEL_KCTXT; i < num_vls; i++) {
unsigned tctxt;
for (qpn = 0, tctxt = ctxt;
krcvqs[i] && qpn < qpns_per_vl; qpn++) {
unsigned idx, regoff, regidx;
- /* generate index <= 128 */
- idx = (qpn << n) ^ i;
+ /* generate the index the hardware will produce */
+ idx = rmt->used + ((qpn << n) ^ i);
regoff = (idx % 8) * 8;
regidx = idx / 8;
- reg = rsmmap[regidx];
- /* replace 0xff with context number */
+ /* replace default with context number */
+ reg = rmt->map[regidx];
reg &= ~(RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK
<< regoff);
reg |= (u64)(tctxt++) << regoff;
- rsmmap[regidx] = reg;
+ rmt->map[regidx] = reg;
if (tctxt == ctxt + krcvqs[i])
tctxt = ctxt;
}
ctxt += krcvqs[i];
}
- /* flush cached copies to chip */
- for (i = 0; i < NUM_MAP_REGS; i++)
- write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), rsmmap[i]);
- /* add rule0 */
- write_csr(dd, RCV_RSM_CFG /* + (8 * 0) */,
- RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_MASK <<
- RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT |
- 2ull << RCV_RSM_CFG_PACKET_TYPE_SHIFT);
- write_csr(dd, RCV_RSM_SELECT /* + (8 * 0) */,
- LRH_BTH_MATCH_OFFSET << RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT |
- LRH_SC_MATCH_OFFSET << RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT |
- LRH_SC_SELECT_OFFSET << RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT |
- ((u64)n) << RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT |
- QPN_SELECT_OFFSET << RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT |
- ((u64)m + (u64)n) << RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT);
- write_csr(dd, RCV_RSM_MATCH /* + (8 * 0) */,
- LRH_BTH_MASK << RCV_RSM_MATCH_MASK1_SHIFT |
- LRH_BTH_VALUE << RCV_RSM_MATCH_VALUE1_SHIFT |
- LRH_SC_MASK << RCV_RSM_MATCH_MASK2_SHIFT |
- LRH_SC_VALUE << RCV_RSM_MATCH_VALUE2_SHIFT);
- /* Enable RSM */
- add_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK);
- kfree(rsmmap);
- /* map everything else to first context */
- init_qpmap_table(dd, FIRST_KERNEL_KCTXT, MIN_KERNEL_KCTXTS - 1);
+
+ rrd.offset = rmt->used;
+ rrd.pkt_type = 2;
+ rrd.field1_off = LRH_BTH_MATCH_OFFSET;
+ rrd.field2_off = LRH_SC_MATCH_OFFSET;
+ rrd.index1_off = LRH_SC_SELECT_OFFSET;
+ rrd.index1_width = n;
+ rrd.index2_off = QPN_SELECT_OFFSET;
+ rrd.index2_width = m + n;
+ rrd.mask1 = LRH_BTH_MASK;
+ rrd.value1 = LRH_BTH_VALUE;
+ rrd.mask2 = LRH_SC_MASK;
+ rrd.value2 = LRH_SC_VALUE;
+
+ /* add rule 0 */
+ add_rsm_rule(dd, 0, &rrd);
+
+ /* mark RSM map entries as used */
+ rmt->used += rmt_entries;
+ /* map everything else to the mcast/err/vl15 context */
+ init_qpmap_table(dd, HFI1_CTRL_CTXT, HFI1_CTRL_CTXT);
dd->qos_shift = n + 1;
return;
bail:
@@ -13574,13 +13764,86 @@ bail:
init_qpmap_table(dd, FIRST_KERNEL_KCTXT, dd->n_krcv_queues - 1);
}
+static void init_user_fecn_handling(struct hfi1_devdata *dd,
+ struct rsm_map_table *rmt)
+{
+ struct rsm_rule_data rrd;
+ u64 reg;
+ int i, idx, regoff, regidx;
+ u8 offset;
+
+ /* there needs to be enough room in the map table */
+ if (rmt->used + dd->num_user_contexts >= NUM_MAP_ENTRIES) {
+ dd_dev_err(dd, "User FECN handling disabled - too many user contexts allocated\n");
+ return;
+ }
+
+ /*
+ * RSM will extract the destination context as an index into the
+ * map table. The destination contexts are a sequential block
+ * in the range first_user_ctxt...num_rcv_contexts-1 (inclusive).
+ * Map entries are accessed as offset + extracted value. Adjust
+ * the added offset so this sequence can be placed anywhere in
+ * the table - as long as the entries themselves do not wrap.
+ * There are only enough bits in offset for the table size, so
+ * start with that to allow for a "negative" offset.
+ */
+ offset = (u8)(NUM_MAP_ENTRIES + (int)rmt->used -
+ (int)dd->first_user_ctxt);
+
+ for (i = dd->first_user_ctxt, idx = rmt->used;
+ i < dd->num_rcv_contexts; i++, idx++) {
+ /* replace with identity mapping */
+ regoff = (idx % 8) * 8;
+ regidx = idx / 8;
+ reg = rmt->map[regidx];
+ reg &= ~(RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK << regoff);
+ reg |= (u64)i << regoff;
+ rmt->map[regidx] = reg;
+ }
+
+ /*
+ * For RSM intercept of Expected FECN packets:
+ * o packet type 0 - expected
+ * o match on F (bit 95), using select/match 1, and
+ * o match on SH (bit 133), using select/match 2.
+ *
+ * Use index 1 to extract the 8-bit receive context from DestQP
+ * (start at bit 64). Use that as the RSM map table index.
+ */
+ rrd.offset = offset;
+ rrd.pkt_type = 0;
+ rrd.field1_off = 95;
+ rrd.field2_off = 133;
+ rrd.index1_off = 64;
+ rrd.index1_width = 8;
+ rrd.index2_off = 0;
+ rrd.index2_width = 0;
+ rrd.mask1 = 1;
+ rrd.value1 = 1;
+ rrd.mask2 = 1;
+ rrd.value2 = 1;
+
+ /* add rule 1 */
+ add_rsm_rule(dd, 1, &rrd);
+
+ rmt->used += dd->num_user_contexts;
+}
+
static void init_rxe(struct hfi1_devdata *dd)
{
+ struct rsm_map_table *rmt;
+
/* enable all receive errors */
write_csr(dd, RCV_ERR_MASK, ~0ull);
- /* setup QPN map table - start where VL15 context leaves off */
- init_qos(dd, dd->n_krcv_queues > MIN_KERNEL_KCTXTS ?
- MIN_KERNEL_KCTXTS : 0);
+
+ rmt = alloc_rsm_map_table(dd);
+ /* set up QOS, including the QPN map table */
+ init_qos(dd, rmt);
+ init_user_fecn_handling(dd, rmt);
+ complete_rsm_map_table(dd, rmt);
+ kfree(rmt);
+
/*
* make sure RcvCtrl.RcvWcb <= PCIe Device Control
* Register Max_Payload_Size (PCI_EXP_DEVCTL in Linux PCIe config
@@ -13762,6 +14025,7 @@ int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt, u16 pkey)
write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_PARTITION_KEY, reg);
reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
reg |= SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK;
+ reg &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK;
write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
done:
return ret;
@@ -14148,6 +14412,19 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
(dd->revision >> CCE_REVISION_SW_SHIFT)
& CCE_REVISION_SW_MASK);
+ /*
+ * The real cpu mask is part of the affinity struct but has to be
+ * initialized earlier than the rest of the affinity struct because it
+ * is needed to calculate the number of user contexts in
+ * set_up_context_variables(). However, hfi1_dev_affinity_init(),
+ * which initializes the rest of the affinity struct members,
+ * depends on set_up_context_variables() for the number of kernel
+ * contexts, so it cannot be called before set_up_context_variables().
+ */
+ ret = init_real_cpu_mask(dd);
+ if (ret)
+ goto bail_cleanup;
+
ret = set_up_context_variables(dd);
if (ret)
goto bail_cleanup;
@@ -14161,9 +14438,7 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
/* set up KDETH QP prefix in both RX and TX CSRs */
init_kdeth_qp(dd);
- ret = hfi1_dev_affinity_init(dd);
- if (ret)
- goto bail_cleanup;
+ hfi1_dev_affinity_init(dd);
/* send contexts must be set up before receive contexts */
ret = init_send_contexts(dd);