summaryrefslogtreecommitdiffstats
path: root/drivers
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2020-05-28 11:04:12 -0700
committerDavid S. Miller <davem@davemloft.net>2020-05-28 11:04:12 -0700
commit1eba1110f0a7a2faa88b8bd1fc5516870bda387b (patch)
tree17e0057dbec3b54371490b801bf69424ddb7e765 /drivers
parentd29245692a44d71d5e2e0770463184a693696232 (diff)
parented03a418abe8e5a3ba541a805314bbf8a9eadda3 (diff)
Merge tag 'mlx5-updates-2020-05-26' of git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux
Saeed Mahameed says: ==================== mlx5-updates-2020-05-26 Updates highlights: 1) From Vu Pham (8): Support VM traffics failover with bonded VF representors and e-switch egress/ingress ACLs This series introduce the support for Virtual Machine running I/O traffic over direct/fast VF path and failing over to slower paravirtualized path using the following features: __________________________________ | VM _________________ | | |FAILOVER device | | | |________________| | | | | | ____|_____ | | | | | | ______ |___ ____|_______ | | | VF PT | |VIRTIO-NET | | | | device | | device | | | |_________| |___________| | |___________|______________|________| | | | HYPERVISOR | | ____|______ | | macvtap | | |virtio BE | | |___________| | | | ____|_____ | |host VF | | |_________| | | _____|______ _____|_____ | PT VF | | host VF | |representor| |representor| |___________| |___________| \ / \ / \ / \ / _________________ \_______/ | | _______|________ | V-SWITCH | |VF representors |________________| (OVS) | | bond | |________________| |________________| | ________|________ | Uplink | | representor | |_________________| Summary: -------- Problem statement: ------------------ Currently in above topology, when netfailover device is configured using VFs and eswitch VF representors, and when traffic fails over to stand-by VF which is exposed using macvtap device to guest VM, eswitch fails to switch the traffic to the stand-by VF representor. This occurs because there is no knowledge at eswitch level of the stand-by representor device. Solution: --------- Using standard bonding driver, a bond netdevice is created over VF representor device which is used for offloading tc rules. Two VF representors are bonded together, one for the passthrough VF device and another one for the stand-by VF device. With this solution, mlx5 driver listens to the failover events occuring at the bond device level to failover traffic to either of the active VF representor of the bond. a. VM with netfailover device of VF pass-thru (PT) device and virtio-net paravirtualized device with same MAC-address to handle failover traffics at VM level. b. Host bond is active-standby mode, with the lower devices being the VM VF PT representor, and the representor of the 2nd VF to handle failover traffics at Hypervisor/V-Switch OVS level. - During the steady state (fast datapath): set the bond active device to be the VM PT VF representor. - During failover: apply bond failover to the second VF representor device which connects to the VM non-accelerated path. c. E-Switch ingress/egress ACL tables to support failover traffics at E-Switch level I. E-Switch egress ACL with forward-to-vport rule: - By default, eswitch vport egress acl forward packets to its counterpart NIC vport. - During port failover, the egress acl forward-to-vport rule will be added to e-switch vport of passive/in-active slave VF representor to forward packets to other e-switch vport ie. the active slave representor's e-switch vport to handle egress "failover" traffics. - Using lower change netdev event to detect a representor is a lower dev (slave) of bond and becomes active, adding egress acl forward-to-vport rule of all other slave netdevs to forward to this representor's vport. - Using upper change netdev event to detect a representor unslaving from bond device to delete its vport's egress acl forward-to-vport rule. II. E-Switch ingress ACL metadata reg_c for match - Bonded representors' vorts sharing tc block have the same root ingress acl table and a unique metadata for match. - Traffics from both representors's vports will be tagged with same unique metadata reg_c. - Using upper change netdev event to detect a representor enslaving/unslaving from bond device to setup shared root ingress acl and unique metadata. 2) From Alex Vesker (2): Slpit RX and TX lock for parallel rule insertion in software steering 3) Eli Britstein (2): Optimize performance for IPv4/IPv6 ethertype use the HW ip_version register rather than parsing eth frames for ethertype. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'drivers')
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/Makefile7
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c85
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/rep/bond.c350
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c10
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c21
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_rep.c30
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_rep.h13
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_tc.c96
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_tc.h4
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c170
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_ofld.c235
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.c160
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.h26
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c279
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_ofld.c322
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/esw/acl/lgcy.h17
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ofld.h29
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/eswitch.c559
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/eswitch.h41
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c401
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/main.c16
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c14
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c10
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c31
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c13
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/steering/dr_table.c12
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h25
27 files changed, 1965 insertions, 1011 deletions
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index e5ee9103fefb..b61e47bc16e8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -34,7 +34,8 @@ mlx5_core-$(CONFIG_MLX5_EN_ARFS) += en_arfs.o
mlx5_core-$(CONFIG_MLX5_EN_RXNFC) += en_fs_ethtool.o
mlx5_core-$(CONFIG_MLX5_CORE_EN_DCB) += en_dcbnl.o en/port_buffer.o
mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += en/hv_vhca_stats.o
-mlx5_core-$(CONFIG_MLX5_ESWITCH) += en_rep.o lib/geneve.o lib/port_tun.o lag_mp.o
+mlx5_core-$(CONFIG_MLX5_ESWITCH) += lag_mp.o lib/geneve.o lib/port_tun.o \
+ en_rep.o en/rep/bond.o
mlx5_core-$(CONFIG_MLX5_CLS_ACT) += en_tc.o en/rep/tc.o en/rep/neigh.o \
en/mapping.o esw/chains.o en/tc_tun.o \
en/tc_tun_vxlan.o en/tc_tun_gre.o en/tc_tun_geneve.o \
@@ -46,6 +47,10 @@ mlx5_core-$(CONFIG_MLX5_TC_CT) += en/tc_ct.o
#
mlx5_core-$(CONFIG_MLX5_ESWITCH) += eswitch.o eswitch_offloads.o eswitch_offloads_termtbl.o \
ecpf.o rdma.o
+mlx5_core-$(CONFIG_MLX5_ESWITCH) += esw/acl/helper.o \
+ esw/acl/egress_lgcy.o esw/acl/egress_ofld.o \
+ esw/acl/ingress_lgcy.o esw/acl/ingress_ofld.o
+
mlx5_core-$(CONFIG_MLX5_MPFS) += lib/mpfs.o
mlx5_core-$(CONFIG_VXLAN) += lib/vxlan.o
mlx5_core-$(CONFIG_PTP_1588_CLOCK) += lib/clock.o
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c
index 8ecac81a385d..a700f3c86899 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c
@@ -76,58 +76,59 @@ static void print_lyr_2_4_hdrs(struct trace_seq *p,
.v = MLX5_GET(fte_match_set_lyr_2_4, value, dmac_47_16) << 16 |
MLX5_GET(fte_match_set_lyr_2_4, value, dmac_15_0)};
MASK_VAL_L2(u16, ethertype, ethertype);
+ MASK_VAL_L2(u8, ip_version, ip_version);
PRINT_MASKED_VALP(smac, u8 *, p, "%pM");
PRINT_MASKED_VALP(dmac, u8 *, p, "%pM");
PRINT_MASKED_VAL(ethertype, p, "%04x");
- if (ethertype.m == 0xffff) {
- if (ethertype.v == ETH_P_IP) {
+ if ((ethertype.m == 0xffff && ethertype.v == ETH_P_IP) ||
+ (ip_version.m == 0xf && ip_version.v == 4)) {
#define MASK_VAL_L2_BE(type, name, fld) \
MASK_VAL_BE(type, fte_match_set_lyr_2_4, name, mask, value, fld)
- MASK_VAL_L2_BE(u32, src_ipv4,
- src_ipv4_src_ipv6.ipv4_layout.ipv4);
- MASK_VAL_L2_BE(u32, dst_ipv4,
- dst_ipv4_dst_ipv6.ipv4_layout.ipv4);
+ MASK_VAL_L2_BE(u32, src_ipv4,
+ src_ipv4_src_ipv6.ipv4_layout.ipv4);
+ MASK_VAL_L2_BE(u32, dst_ipv4,
+ dst_ipv4_dst_ipv6.ipv4_layout.ipv4);
- PRINT_MASKED_VALP(src_ipv4, typeof(&src_ipv4.v), p,
- "%pI4");
- PRINT_MASKED_VALP(dst_ipv4, typeof(&dst_ipv4.v), p,
- "%pI4");
- } else if (ethertype.v == ETH_P_IPV6) {
- static const struct in6_addr full_ones = {
- .in6_u.u6_addr32 = {__constant_htonl(0xffffffff),
- __constant_htonl(0xffffffff),
- __constant_htonl(0xffffffff),
- __constant_htonl(0xffffffff)},
- };
- DECLARE_MASK_VAL(struct in6_addr, src_ipv6);
- DECLARE_MASK_VAL(struct in6_addr, dst_ipv6);
+ PRINT_MASKED_VALP(src_ipv4, typeof(&src_ipv4.v), p,
+ "%pI4");
+ PRINT_MASKED_VALP(dst_ipv4, typeof(&dst_ipv4.v), p,
+ "%pI4");
+ } else if ((ethertype.m == 0xffff && ethertype.v == ETH_P_IPV6) ||
+ (ip_version.m == 0xf && ip_version.v == 6)) {
+ static const struct in6_addr full_ones = {
+ .in6_u.u6_addr32 = {__constant_htonl(0xffffffff),
+ __constant_htonl(0xffffffff),
+ __constant_htonl(0xffffffff),
+ __constant_htonl(0xffffffff)},
+ };
+ DECLARE_MASK_VAL(struct in6_addr, src_ipv6);
+ DECLARE_MASK_VAL(struct in6_addr, dst_ipv6);
- memcpy(src_ipv6.m.in6_u.u6_addr8,
- MLX5_ADDR_OF(fte_match_set_lyr_2_4, mask,
- src_ipv4_src_ipv6.ipv6_layout.ipv6),
- sizeof(src_ipv6.m));
- memcpy(dst_ipv6.m.in6_u.u6_addr8,
- MLX5_ADDR_OF(fte_match_set_lyr_2_4, mask,
- dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
- sizeof(dst_ipv6.m));
- memcpy(src_ipv6.v.in6_u.u6_addr8,
- MLX5_ADDR_OF(fte_match_set_lyr_2_4, value,
- src_ipv4_src_ipv6.ipv6_layout.ipv6),
- sizeof(src_ipv6.v));
- memcpy(dst_ipv6.v.in6_u.u6_addr8,
- MLX5_ADDR_OF(fte_match_set_lyr_2_4, value,
- dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
- sizeof(dst_ipv6.v));
+ memcpy(src_ipv6.m.in6_u.u6_addr8,
+ MLX5_ADDR_OF(fte_match_set_lyr_2_4, mask,
+ src_ipv4_src_ipv6.ipv6_layout.ipv6),
+ sizeof(src_ipv6.m));
+ memcpy(dst_ipv6.m.in6_u.u6_addr8,
+ MLX5_ADDR_OF(fte_match_set_lyr_2_4, mask,
+ dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
+ sizeof(dst_ipv6.m));
+ memcpy(src_ipv6.v.in6_u.u6_addr8,
+ MLX5_ADDR_OF(fte_match_set_lyr_2_4, value,
+ src_ipv4_src_ipv6.ipv6_layout.ipv6),
+ sizeof(src_ipv6.v));
+ memcpy(dst_ipv6.v.in6_u.u6_addr8,
+ MLX5_ADDR_OF(fte_match_set_lyr_2_4, value,
+ dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
+ sizeof(dst_ipv6.v));
- if (!memcmp(&src_ipv6.m, &full_ones, sizeof(full_ones)))
- trace_seq_printf(p, "src_ipv6=%pI6 ",
- &src_ipv6.v);
- if (!memcmp(&dst_ipv6.m, &full_ones, sizeof(full_ones)))
- trace_seq_printf(p, "dst_ipv6=%pI6 ",
- &dst_ipv6.v);
- }
+ if (!memcmp(&src_ipv6.m, &full_ones, sizeof(full_ones)))
+ trace_seq_printf(p, "src_ipv6=%pI6 ",
+ &src_ipv6.v);
+ if (!memcmp(&dst_ipv6.m, &full_ones, sizeof(full_ones)))
+ trace_seq_printf(p, "dst_ipv6=%pI6 ",
+ &dst_ipv6.v);
}
#define PRINT_MASKED_VAL_L2(type, name, fld, p, format) {\
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bond.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bond.c
new file mode 100644
index 000000000000..bdb71332cbf2
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bond.c
@@ -0,0 +1,350 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2020 Mellanox Technologies Inc. All rights reserved. */
+
+#include <linux/netdevice.h>
+#include <linux/list.h>
+#include <net/lag.h>
+
+#include "mlx5_core.h"
+#include "eswitch.h"
+#include "esw/acl/ofld.h"
+#include "en_rep.h"
+
+struct mlx5e_rep_bond {
+ struct notifier_block nb;
+ struct netdev_net_notifier nn;
+ struct list_head metadata_list;
+};
+
+struct mlx5e_rep_bond_slave_entry {
+ struct list_head list;
+ struct net_device *netdev;
+};
+
+struct mlx5e_rep_bond_metadata {
+ struct list_head list; /* link to global list of rep_bond_metadata */
+ struct mlx5_eswitch *esw;
+ /* private of uplink holding rep bond metadata list */
+ struct net_device *lag_dev;
+ u32 metadata_reg_c_0;
+
+ struct list_head slaves_list; /* slaves list */
+ int slaves;
+};
+
+static struct mlx5e_rep_bond_metadata *
+mlx5e_lookup_rep_bond_metadata(struct mlx5_rep_uplink_priv *uplink_priv,
+ const struct net_device *lag_dev)
+{
+ struct mlx5e_rep_bond_metadata *found = NULL;
+ struct mlx5e_rep_bond_metadata *cur;
+
+ list_for_each_entry(cur, &uplink_priv->bond->metadata_list, list) {
+ if (cur->lag_dev == lag_dev) {
+ found = cur;
+ break;
+ }
+ }
+
+ return found;
+}
+
+static struct mlx5e_rep_bond_slave_entry *
+mlx5e_lookup_rep_bond_slave_entry(struct mlx5e_rep_bond_metadata *mdata,
+ const struct net_device *netdev)
+{
+ struct mlx5e_rep_bond_slave_entry *found = NULL;
+ struct mlx5e_rep_bond_slave_entry *cur;
+
+ list_for_each_entry(cur, &mdata->slaves_list, list) {
+ if (cur->netdev == netdev) {
+ found = cur;
+ break;
+ }
+ }
+
+ return found;
+}
+
+static void mlx5e_rep_bond_metadata_release(struct mlx5e_rep_bond_metadata *mdata)
+{
+ netdev_dbg(mdata->lag_dev, "destroy rep_bond_metadata(%d)\n",
+ mdata->metadata_reg_c_0);
+ list_del(&mdata->list);
+ mlx5_esw_match_metadata_free(mdata->esw, mdata->metadata_reg_c_0);
+ WARN_ON(!list_empty(&mdata->slaves_list));
+ kfree(mdata);
+}
+
+/* This must be called under rtnl_lock */
+int mlx5e_rep_bond_enslave(struct mlx5_eswitch *esw, struct net_device *netdev,
+ struct net_device *lag_dev)
+{
+ struct mlx5e_rep_bond_slave_entry *s_entry;
+ struct mlx5e_rep_bond_metadata *mdata;
+ struct mlx5e_rep_priv *rpriv;
+ struct mlx5e_priv *priv;
+ int err;
+
+ ASSERT_RTNL();
+
+ rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
+ mdata = mlx5e_lookup_rep_bond_metadata(&rpriv->uplink_priv, lag_dev);
+ if (!mdata) {
+ /* First netdev becomes slave, no metadata presents the lag_dev. Create one */
+ mdata = kzalloc(sizeof(*mdata), GFP_KERNEL);
+ if (!mdata)
+ return -ENOMEM;
+
+ mdata->lag_dev = lag_dev;
+ mdata->esw = esw;
+ INIT_LIST_HEAD(&mdata->slaves_list);
+ mdata->metadata_reg_c_0 = mlx5_esw_match_metadata_alloc(esw);
+ if (!mdata->metadata_reg_c_0) {
+ kfree(mdata);
+ return -ENOSPC;
+ }
+ list_add(&mdata->list, &rpriv->uplink_priv.bond->metadata_list);
+
+ netdev_dbg(lag_dev, "create rep_bond_metadata(%d)\n",
+ mdata->metadata_reg_c_0);
+ }
+
+ s_entry = kzalloc(sizeof(*s_entry), GFP_KERNEL);
+ if (!s_entry) {
+ err = -ENOMEM;
+ goto entry_alloc_err;
+ }
+
+ s_entry->netdev = netdev;
+ priv = netdev_priv(netdev);
+ rpriv = priv->ppriv;
+
+ err = mlx5_esw_acl_ingress_vport_bond_update(esw, rpriv->rep->vport,
+ mdata->metadata_reg_c_0);
+ if (err)
+ goto ingress_err;
+
+ mdata->slaves++;
+ list_add_tail(&s_entry->list, &mdata->slaves_list);
+ netdev_dbg(netdev, "enslave rep vport(%d) lag_dev(%s) metadata(0x%x)\n",
+ rpriv->rep->vport, lag_dev->name, mdata->metadata_reg_c_0);
+
+ return 0;
+
+ingress_err:
+ kfree(s_entry);
+entry_alloc_err:
+ if (!mdata->slaves)
+ mlx5e_rep_bond_metadata_release(mdata);
+ return err;
+}
+
+/* This must be called under rtnl_lock */
+void mlx5e_rep_bond_unslave(struct mlx5_eswitch *esw,
+ const struct net_device *netdev,
+ const struct net_device *lag_dev)
+{
+ struct mlx5e_rep_bond_slave_entry *s_entry;
+ struct mlx5e_rep_bond_metadata *mdata;
+ struct mlx5e_rep_priv *rpriv;
+ struct mlx5e_priv *priv;
+
+ ASSERT_RTNL();
+
+ rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
+ mdata = mlx5e_lookup_rep_bond_metadata(&rpriv->uplink_priv, lag_dev);
+ if (!mdata)
+ return;
+
+ s_entry = mlx5e_lookup_rep_bond_slave_entry(mdata, netdev);
+ if (!s_entry)
+ return;
+
+ priv = netdev_priv(netdev);
+ rpriv = priv->ppriv;
+
+ /* Reset bond_metadata to zero first then reset all ingress/egress
+ * acls and rx rules of unslave representor's vport
+ */
+ mlx5_esw_acl_ingress_vport_bond_update(esw, rpriv->rep->vport, 0);
+ mlx5_esw_acl_egress_vport_unbond(esw, rpriv->rep->vport);
+ mlx5e_rep_bond_update(priv, false);
+
+ list_del(&s_entry->list);
+
+ netdev_dbg(netdev, "unslave rep vport(%d) lag_dev(%s) metadata(0x%x)\n",
+ rpriv->rep->vport, lag_dev->name, mdata->metadata_reg_c_0);
+
+ if (--mdata->slaves == 0)
+ mlx5e_rep_bond_metadata_release(mdata);
+ kfree(s_entry);
+}
+
+static bool mlx5e_rep_is_lag_netdev(struct net_device *netdev)
+{
+ struct mlx5e_priv *priv = netdev_priv(netdev);
+ struct mlx5e_rep_priv *rpriv = priv->ppriv;
+
+ /* A given netdev is not a representor or not a slave of LAG configuration */
+ if (!mlx5e_eswitch_rep(netdev) || !bond_slave_get_rtnl(netdev))
+ return false;
+
+ /* Egress acl forward to vport is supported only non-uplink representor */
+ return rpriv->rep->vport != MLX5_VPORT_UPLINK;
+}
+
+static void mlx5e_rep_changelowerstate_event(struct net_device *netdev, void *ptr)
+{
+ struct netdev_notifier_changelowerstate_info *info;
+ struct netdev_lag_lower_state_info *lag_info;
+ struct mlx5e_rep_priv *rpriv;
+ struct net_device *lag_dev;
+ struct mlx5e_priv *priv;
+ struct list_head *iter;
+ struct net_device *dev;
+ u16 acl_vport_num;
+ u16 fwd_vport_num;
+ int err;
+
+ if (!mlx5e_rep_is_lag_netdev(netdev))
+ return;
+
+ info = ptr;
+ lag_info = info->lower_state_info;
+ /* This is not an event of a representor becoming active slave */
+ if (!lag_info->tx_enabled)
+ return;
+
+ priv = netdev_priv(netdev);
+ rpriv = priv->ppriv;
+ fwd_vport_num = rpriv->rep->vport;
+ lag_dev = netdev_master_upper_dev_get(netdev);
+
+ netdev_dbg(netdev, "lag_dev(%s)'s slave vport(%d) is txable(%d)\n",
+ lag_dev->name, fwd_vport_num, net_lag_port_dev_txable(netdev));
+
+ /* Point everyone's egress acl to the vport of the active representor */
+ netdev_for_each_lower_dev(lag_dev, dev, iter) {
+ priv = netdev_priv(dev);
+ rpriv = priv->ppriv;
+ acl_vport_num = rpriv->rep->vport;
+ if (acl_vport_num != fwd_vport_num) {
+ /* Only single rx_rule for unique bond_metadata should be
+ * present, delete it if it's saved as passive vport's
+ * rx_rule with destination as passive vport's root_ft
+ */
+ mlx5e_rep_bond_update(priv, true);
+ err = mlx5_esw_acl_egress_vport_bond(priv->mdev->priv.eswitch,
+ fwd_vport_num,
+ acl_vport_num);
+ if (err)
+ netdev_warn(dev,
+ "configure slave vport(%d) egress fwd, err(%d)",
+ acl_vport_num, err);
+ }
+ }
+
+ /* Insert new rx_rule for unique bond_metadata, save it as active vport's
+ * rx_rule with new destination as active vport's root_ft
+ */
+ err = mlx5e_rep_bond_update(netdev_priv(netdev), false);
+ if (err)
+ netdev_warn(netdev, "configure active slave vport(%d) rx_rule, err(%d)",
+ fwd_vport_num, err);
+}
+
+static void mlx5e_rep_changeupper_event(struct net_device *netdev, void *ptr)
+{
+ struct netdev_notifier_changeupper_info *info = ptr;
+ struct mlx5e_rep_priv *rpriv;
+ struct net_device *lag_dev;
+ struct mlx5e_priv *priv;
+
+ if (!mlx5e_rep_is_lag_netdev(netdev))
+ return;
+
+ priv = netdev_priv(netdev);
+ rpriv = priv->ppriv;
+ lag_dev = info->upper_dev;
+
+ netdev_dbg(netdev, "%sslave vport(%d) lag(%s)\n",
+ info->linking ? "en" : "un", rpriv->rep->vport, lag_dev->name);
+
+ if (info->linking)
+ mlx5e_rep_bond_enslave(priv->mdev->priv.eswitch, netdev, lag_dev);
+ else
+ mlx5e_rep_bond_unslave(priv->mdev->priv.eswitch, netdev, lag_dev);
+}
+
+/* Bond device of representors and netdev events are used here in specific way
+ * to support eswitch vports bonding and to perform failover of eswitch vport
+ * by modifying the vport's egress acl of lower dev representors. Thus this
+ * also change the traditional behavior of lower dev under bond device.
+ * All non-representor netdevs or representors of other vendors as lower dev
+ * of bond device are not supported.
+ */
+static int mlx5e_rep_esw_bond_netevent(struct notifier_block *nb,
+ unsigned long event, void *ptr)
+{
+ struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+
+ switch (event) {
+ case NETDEV_CHANGELOWERSTATE:
+ mlx5e_rep_changelowerstate_event(netdev, ptr);
+ break;
+ case NETDEV_CHANGEUPPER:
+ mlx5e_rep_changeupper_event(netdev, ptr);
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+/* If HW support eswitch vports bonding, register a specific notifier to
+ * handle it when two or more representors are bonded
+ */
+int mlx5e_rep_bond_init(struct mlx5e_rep_priv *rpriv)
+{
+ struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv;
+ struct net_device *netdev = rpriv->netdev;
+ struct mlx5e_priv *priv;
+ int ret = 0;
+
+ priv = netdev_priv(netdev);
+ if (!mlx5_esw_acl_egress_fwd2vport_supported(priv->mdev->priv.eswitch))
+ goto out;
+
+ uplink_priv->bond = kvzalloc(sizeof(*uplink_priv->bond), GFP_KERNEL);
+ if (!uplink_priv->bond) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ INIT_LIST_HEAD(&uplink_priv->bond->metadata_list);
+ uplink_priv->bond->nb.notifier_call = mlx5e_rep_esw_bond_netevent;
+ ret = register_netdevice_notifier_dev_net(netdev,
+ &uplink_priv->bond->nb,
+ &uplink_priv->bond->nn);
+ if (ret) {
+ netdev_err(netdev, "register bonding netevent notifier, err(%d)\n", ret);
+ kvfree(uplink_priv->bond);
+ uplink_priv->bond = NULL;
+ }
+
+out:
+ return ret;
+}
+
+void mlx5e_rep_bond_cleanup(struct mlx5e_rep_priv *rpriv)
+{
+ struct mlx5e_priv *priv = netdev_priv(rpriv->netdev);
+
+ if (!mlx5_esw_acl_egress_fwd2vport_supported(priv->mdev->priv.eswitch) ||
+ !rpriv->uplink_priv.bond)
+ return;
+
+ unregister_netdevice_notifier_dev_net(rpriv->netdev,
+ &rpriv->uplink_priv.bond->nb,
+ &rpriv->uplink_priv.bond->nn);
+ kvfree(rpriv->uplink_priv.bond);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c
index 995b2ef1fb3b..afc19dca1f5f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c
@@ -119,7 +119,7 @@ mlx5_tc_ct_get_ct_priv(struct mlx5e_priv *priv)
}
static int
-mlx5_tc_ct_set_tuple_match(struct mlx5_flow_spec *spec,
+mlx5_tc_ct_set_tuple_match(struct mlx5e_priv *priv, struct mlx5_flow_spec *spec,
struct flow_rule *rule)
{
void *headers_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
@@ -134,10 +134,8 @@ mlx5_tc_ct_set_tuple_match(struct mlx5_flow_spec *spec,
flow_rule_match_basic(rule, &match);
- MLX5_SET(fte_match_set_lyr_2_4, headers_c, ethertype,
- ntohs(match.mask->n_proto));
- MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype,
- ntohs(match.key->n_proto));
+ mlx5e_tc_set_ethertype(priv->mdev, &match, true, headers_c,
+ headers_v);
MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol,
match.mask->ip_proto);
MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol,
@@ -533,7 +531,7 @@ mlx5_tc_ct_entry_add_rule(struct mlx5_tc_ct_priv *ct_priv,
attr->counter = entry->counter;
attr->flags |= MLX5_ESW_ATTR_FLAG_NO_IN_PORT;
- mlx5_tc_ct_set_tuple_match(spec, flow_rule);
+ mlx5_tc_ct_set_tuple_match(netdev_priv(ct_priv->netdev), spec, flow_rule);
mlx5e_tc_match_to_reg_match(spec, ZONE_TO_REG,
entry->zone & MLX5_CT_ZONE_MASK,
MLX5_CT_ZONE_MASK);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c
index e99382f58807..7cce85faa16f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c
@@ -512,6 +512,13 @@ int mlx5e_tc_tun_parse(struct net_device *filter_dev,
}
if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_CONTROL)) {
+ struct flow_dissector_key_basic key_basic = {};
+ struct flow_dissector_key_basic mask_basic = {
+ .n_proto = htons(0xFFFF),
+ };
+ struct flow_match_basic match_basic = {
+ .key = &key_basic, .mask = &mask_basic,
+ };
struct flow_match_control match;
u16 addr_type;
@@ -537,10 +544,9 @@ int mlx5e_tc_tun_parse(struct net_device *filter_dev,
dst_ipv4_dst_ipv6.ipv4_layout.ipv4,
ntohl(match.key->dst));
- MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c,
- ethertype);
- MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype,
- ETH_P_IP);
+ key_basic.n_proto = htons(ETH_P_IP);
+ mlx5e_tc_set_ethertype(priv->mdev, &match_basic, true,
+ headers_c, headers_v);
} else if (addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
struct flow_match_ipv6_addrs match;
@@ -563,10 +569,9 @@ int mlx5e_tc_tun_parse(struct net_device *filter_dev,
&match.key->dst, MLX5_FLD_SZ_BYTES(ipv6_layout,
ipv6));
- MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c,
- ethertype);
- MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype,
- ETH_P_IPV6);
+ key_basic.n_proto = htons(ETH_P_IPV6);
+ mlx5e_tc_set_ethertype(priv->mdev, &match_basic, true,
+ headers_c, headers_v);
}
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 4e13e37a9ecd..af89a4803c7d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -854,6 +854,24 @@ static int mlx5e_create_rep_vport_rx_rule(struct mlx5e_priv *priv)
return 0;
}
+static void rep_vport_rx_rule_destroy(struct mlx5e_priv *priv)
+{
+ struct mlx5e_rep_priv *rpriv = priv->ppriv;
+
+ if (!rpriv->vport_rx_rule)
+ return;
+
+ mlx5_del_flow_rules(rpriv->vport_rx_rule);
+ rpriv->vport_rx_rule = NULL;
+}
+
+int mlx5e_rep_bond_update(struct mlx5e_priv *priv, bool cleanup)
+{
+ rep_vport_rx_rule_destroy(priv);
+
+ return cleanup ? 0 : mlx5e_create_rep_vport_rx_rule(priv);
+}
+
static int mlx5e_init_rep_rx(struct mlx5e_priv *priv)
{
struct mlx5_core_dev *mdev = priv->mdev;
@@ -918,9 +936,7 @@ err_close_drop_rq:
static void mlx5e_cleanup_rep_rx(struct mlx5e_priv *priv)
{
- struct mlx5e_rep_priv *rpriv = priv->ppriv;
-
- mlx5_del_flow_rules(rpriv->vport_rx_rule);
+ rep_vport_rx_rule_destroy(priv);
mlx5e_destroy_rep_root_ft(priv);
mlx5e_destroy_ttc_table(priv, &priv->fs.ttc);
mlx5e_destroy_direct_tirs(priv, priv->direct_tir);
@@ -959,16 +975,18 @@ static int mlx5e_init_uplink_rep_tx(struct mlx5e_rep_priv *rpriv)
mlx5_init_port_tun_entropy(&uplink_priv->tun_entropy, priv->mdev);
+ mlx5e_rep_bond_init(rpriv);
err = mlx5e_rep_tc_netdevice_event_register(rpriv);
if (err) {
mlx5_core_err(priv->mdev, "Failed to register netdev notifier, err: %d\n",
err);
- goto tc_rep_cleanup;
+ goto err_event_reg;
}
return 0;
-tc_rep_cleanup:
+err_event_reg:
+ mlx5e_rep_bond_cleanup(rpriv);
mlx5e_rep_tc_cleanup(rpriv);
return err;
}
@@ -1001,7 +1019,7 @@ static void mlx5e_cleanup_uplink_rep_tx(struct mlx5e_rep_priv *rpriv)
{
mlx5e_rep_tc_netdevice_event_unregister(rpriv);
mlx5e_rep_indr_clean_block_privs(rpriv);
-
+ mlx5e_rep_bond_cleanup(rpriv);
mlx5e_rep_tc_cleanup(rpriv);
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h
index 1c4af8522467..da9f1686d525 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h
@@ -56,6 +56,7 @@ struct mlx5e_neigh_update_table {
};
struct mlx5_tc_ct_priv;
+struct mlx5e_rep_bond;
struct mlx5_rep_uplink_priv {
/* Filters DB - instantiated by the uplink representor and shared by
* the uplink's VFs
@@ -89,6 +90,9 @@ struct mlx5_rep_uplink_priv {
struct mapping_ctx *tunnel_enc_opts_mapping;
struct mlx5_tc_ct_priv *ct_priv;
+
+ /* support eswitch vports bonding */
+ struct mlx5e_rep_bond *bond;
};
struct mlx5e_rep_priv {
@@ -211,6 +215,15 @@ struct mlx5e_rep_sq {
void mlx5e_rep_register_vport_reps(struct mlx5_core_dev *mdev);
void mlx5e_rep_unregister_vport_reps(struct mlx5_core_dev *mdev);
+int mlx5e_rep_bond_init(struct mlx5e_rep_priv *rpriv);
+void mlx5e_rep_bond_cleanup(struct mlx5e_rep_priv *rpriv);
+int mlx5e_rep_bond_enslave(struct mlx5_eswitch *esw, struct net_device *netdev,
+ struct net_device *lag_dev);
+void mlx5e_rep_bond_unslave(struct mlx5_eswitch *esw,
+ const struct net_device *netdev,
+ const struct net_device *lag_dev);
+int mlx5e_rep_bond_update(struct mlx5e_priv *priv, bool cleanup);
+
bool mlx5e_is_uplink_rep(struct mlx5e_priv *priv);
int mlx5e_add_sqs_fwd_rules(struct mlx5e_priv *priv);
void mlx5e_remove_sqs_fwd_rules(struct mlx5e_priv *priv);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 571da14809fe..0f119c08b835 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -50,6 +50,7 @@
#include <net/arp.h>
#include <net/ipv6_stubs.h>
#include <net/bareudp.h>
+#include <net/bonding.h>
#include "en.h"
#include "en_rep.h"
#include "en/rep/tc.h"
@@ -145,6 +146,7 @@ struct mlx5e_tc_flow {
struct list_head hairpin; /* flows sharing the same hairpin */
struct list_head peer; /* flows with peer flow */
struct list_head unready; /* flows not ready to be offloaded (e.g due to missing route) */
+ struct net_device *orig_dev; /* netdev adding flow first */
int tmp_efi_index;
struct list_head tmp_list; /* temporary flow list used by neigh update */
refcount_