path: root/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
author     Achiad Shochat <achiad@mellanox.com>   2015-07-23 23:35:59 +0300
committer  David S. Miller <davem@davemloft.net>  2015-07-27 00:29:17 -0700
commit     88a85f99e51fb2373259ab83c8bb130a9bbf3804 (patch)
tree       f27980b2cdd3c9601a5953c7d268eabc18452a2e /drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
parent     58d522912ac7d25b63f468fa4a4e8bb059c5144e (diff)
net/mlx5e: TX latency optimization to save DMA reads
A regular TX WQE execution involves two or more DMA reads - one to
fetch the WQE, and another one per WQE gather entry.

These DMA reads obviously increase the TX latency. There are two mlx5
mechanisms to bypass these DMA reads:
1) Inline WQE
2) Blue Flame (BF)

An inline WQE contains a whole packet, thus saves the DMA read/s of
the regular WQE gather entry/s. Inline WQE support was already added
in the previous commit.

A BF WQE is written directly to the device I/O mapped memory, thus
enables saving the DMA read that fetches the WQE. The BF WQE I/O
write must be in cache line granularity, thus uses the CPU write
combining mechanism. A BF WQE I/O write acts also as a TX doorbell
for notifying the device of new TX WQEs. A BF WQE is written to the
same I/O mapped address as the regular TX doorbell, thus this address
is being mapped twice - once by ioremap() and once by
io_mapping_map_wc().

While both mechanisms reduce the TX latency, they both consume more
CPU cycles than a regular WQE:
- A BF WQE must still be written to host memory, in addition to being
  written directly to the device I/O mapped memory.
- An inline WQE involves copying the SKB data into it.

To handle this tradeoff, we introduce here a heuristic algorithm that
strives to avoid using these two mechanisms in case the TX queue is
being back-pressured by the device, and limit their usage rate
otherwise.

An inline WQE will always be "Blue Flamed" (written directly to the
device I/O mapped memory) while a BF WQE may not be inlined (may
contain gather entries).

Preliminary testing using netperf UDP_RR shows that the latency goes
down from 17.5us to 16.9us, while the message rate (tested with
pktgen) stays the same.

Signed-off-by: Achiad Shochat <achiad@mellanox.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
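The heuristic boils down to a per-queue "Blue Flame budget" that is refreshed from the
completion state and spent on eligible packets. Below is a standalone sketch of that
budget logic, mirroring the hunks further down; struct mlx5e_sq is reduced to a minimal
stand-in and SQ_BF_BUDGET is an illustrative constant standing in for MLX5E_SQ_BF_BUDGET,
whose real value is defined elsewhere in the driver and is not part of this diff.

#include <stdbool.h>
#include <stdint.h>

#define SQ_BF_BUDGET 16	/* illustrative value, not taken from the patch */

/* minimal stand-in for the struct mlx5e_sq fields used by the heuristic */
struct sq_sketch {
	uint16_t cc;		/* consumer counter: WQEs completed by the device */
	uint16_t pc;		/* producer counter: WQEs posted by the driver */
	uint16_t prev_cc;	/* cc value observed on the previous xmit */
	uint16_t bf_budget;	/* remaining BF/inline sends before falling back */
};

/* Refresh the budget whenever completions have progressed: a fully
 * drained queue (cc == pc) means the device is keeping up, so a fresh
 * budget is granted; any backlog means the queue is back-pressured,
 * so the budget is zeroed and regular gather WQEs are used instead.
 */
static void refresh_bf_budget(struct sq_sketch *sq)
{
	if (sq->cc != sq->prev_cc) {
		sq->prev_cc = sq->cc;
		sq->bf_budget = (sq->cc == sq->pc) ? SQ_BF_BUDGET : 0;
	}
}

/* A packet is a BF candidate only while budget remains, when the stack
 * is not batching more packets behind it (xmit_more) and when the skb
 * has no frags; such a packet may then also be fully inlined if its
 * linear part fits within sq->max_inline (see
 * mlx5e_get_inline_hdr_size() in the diff below).
 */
static bool use_blue_flame(const struct sq_sketch *sq, bool xmit_more,
			   unsigned int nr_frags)
{
	return sq->bf_budget && !xmit_more && !nr_frags;
}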
Diffstat (limited to 'drivers/net/ethernet/mellanox/mlx5/core/en_tx.c')
-rw-r--r--	drivers/net/ethernet/mellanox/mlx5/core/en_tx.c	26
1 file changed, 21 insertions(+), 5 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index 351ac6982e22..64380bc0cd6a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -57,7 +57,7 @@ void mlx5e_send_nop(struct mlx5e_sq *sq, bool notify_hw)
 
 	if (notify_hw) {
 		cseg->fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
-		mlx5e_tx_notify_hw(sq, wqe);
+		mlx5e_tx_notify_hw(sq, wqe, 0);
 	}
 }
 
@@ -110,7 +110,7 @@ u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb,
 }
 
 static inline u16 mlx5e_get_inline_hdr_size(struct mlx5e_sq *sq,
-					    struct sk_buff *skb)
+					    struct sk_buff *skb, bool bf)
 {
 	/* Some NIC TX decisions, e.g loopback, are based on the packet
 	 * headers and occur before the data gather.
@@ -118,7 +118,7 @@ static inline u16 mlx5e_get_inline_hdr_size(struct mlx5e_sq *sq,
 	 */
 #define MLX5E_MIN_INLINE (ETH_HLEN + 2/*vlan tag*/)
 
-	if (skb_headlen(skb) <= sq->max_inline)
+	if (bf && (skb_headlen(skb) <= sq->max_inline))
 		return skb_headlen(skb);
 
 	return MLX5E_MIN_INLINE;
@@ -137,6 +137,7 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb)
 
 	u8  opcode = MLX5_OPCODE_SEND;
 	dma_addr_t dma_addr = 0;
+	bool bf = false;
 	u16 headlen;
 	u16 ds_cnt;
 	u16 ihs;
@@ -149,6 +150,11 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb)
 	else
 		sq->stats.csum_offload_none++;
 
+	if (sq->cc != sq->prev_cc) {
+		sq->prev_cc = sq->cc;
+		sq->bf_budget = (sq->cc == sq->pc) ? MLX5E_SQ_BF_BUDGET : 0;
+	}
+
 	if (skb_is_gso(skb)) {
 		u32 payload_len;
 
@@ -161,7 +167,10 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb)
 		sq->stats.tso_packets++;
 		sq->stats.tso_bytes += payload_len;
 	} else {
-		ihs = mlx5e_get_inline_hdr_size(sq, skb);
+		bf = sq->bf_budget &&
+		     !skb->xmit_more &&
+		     !skb_shinfo(skb)->nr_frags;
+		ihs = mlx5e_get_inline_hdr_size(sq, skb, bf);
 		MLX5E_TX_SKB_CB(skb)->num_bytes = max_t(unsigned int, skb->len,
 							ETH_ZLEN);
 	}
@@ -233,14 +242,21 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb)
 	}
 
 	if (!skb->xmit_more || netif_xmit_stopped(sq->txq)) {
+		int bf_sz = 0;
+
+		if (bf && sq->uar_bf_map)
+			bf_sz = MLX5E_TX_SKB_CB(skb)->num_wqebbs << 3;
+
 		cseg->fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
-		mlx5e_tx_notify_hw(sq, wqe);
+		mlx5e_tx_notify_hw(sq, wqe, bf_sz);
 	}
 
 	/* fill sq edge with nops to avoid wqe wrap around */
 	while ((sq->pc & wq->sz_m1) > sq->edge)
 		mlx5e_send_nop(sq, false);
 
+	sq->bf_budget = bf ? sq->bf_budget - 1 : 0;
+
 	sq->stats.packets++;
 	return NETDEV_TX_OK;
 
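The new bf_sz argument is consumed by mlx5e_tx_notify_hw(), which lives in en.h and is
therefore outside this diffstat-limited view. The sketch below shows one way that helper
can use it, in line with the commit message: when bf_sz is non-zero, the whole WQE is
pushed through the write-combining mapping, acting as both the WQE-fetch bypass and the
doorbell; otherwise only the regular doorbell write is issued. bf_sz counts 8-byte words
(hence num_wqebbs << 3 above), matching __iowrite64_copy(). Field names such as uar_map,
bf_offset and bf_buf_size, and the exact barrier placement, are assumptions here and are
not taken from this diff.

/* Hedged sketch of the notify side; not part of this file's diff.
 * sq->uar_bf_map is the io_mapping_map_wc() mapping and sq->uar_map
 * the ioremap() one mentioned in the commit message; bf_offset and
 * bf_buf_size (assumed names) alternate between the BF buffers of the
 * UAR page.
 */
static inline void mlx5e_tx_notify_hw_sketch(struct mlx5e_sq *sq,
					     struct mlx5e_tx_wqe *wqe,
					     int bf_sz)
{
	u16 ofst = sq->bf_offset;

	/* ensure the WQE is visible in host memory before updating the
	 * doorbell record
	 */
	wmb();

	/* doorbell record: advertise the new producer counter */
	*sq->wq.db = cpu_to_be32(sq->pc);

	/* ensure the doorbell record is visible before ringing */
	wmb();

	if (bf_sz)
		/* Blue Flame: copy the WQE itself (bf_sz 64-bit words)
		 * through the write-combining mapping; this doubles as
		 * the doorbell and saves the device a DMA read.
		 */
		__iowrite64_copy(sq->uar_bf_map + ofst, &wqe->ctrl, bf_sz);
	else
		/* regular doorbell: write only the 8-byte control word */
		mlx5_write64((__be32 *)&wqe->ctrl, sq->uar_map + ofst, NULL);

	/* flush any data left in the write-combining buffer */
	wmb();

	/* ping-pong between the BF buffers (assumed fields) */
	sq->bf_offset ^= sq->bf_buf_size;
}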