summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/networking/openvswitch.txt40
-rw-r--r--include/uapi/linux/openvswitch.h9
-rw-r--r--net/openvswitch/actions.c6
-rw-r--r--net/openvswitch/datapath.c140
-rw-r--r--net/openvswitch/datapath.h6
-rw-r--r--net/openvswitch/flow.c1387
-rw-r--r--net/openvswitch/flow.h96
7 files changed, 1171 insertions, 513 deletions
diff --git a/Documentation/networking/openvswitch.txt b/Documentation/networking/openvswitch.txt
index 8fa2dd1e792e..37c20ee2455e 100644
--- a/Documentation/networking/openvswitch.txt
+++ b/Documentation/networking/openvswitch.txt
@@ -91,6 +91,46 @@ Often we ellipsize arguments not important to the discussion, e.g.:
in_port(1), eth(...), eth_type(0x0800), ipv4(...), tcp(...)
+Wildcarded flow key format
+--------------------------
+
+A wildcarded flow is described with two sequences of Netlink attributes
+passed over the Netlink socket. A flow key, exactly as described above, and an
+optional corresponding flow mask.
+
+A wildcarded flow can represent a group of exact match flows. Each '1' bit
+in the mask specifies a exact match with the corresponding bit in the flow key.
+A '0' bit specifies a don't care bit, which will match either a '1' or '0' bit
+of a incoming packet. Using wildcarded flow can improve the flow set up rate
+by reduce the number of new flows need to be processed by the user space program.
+
+Support for the mask Netlink attribute is optional for both the kernel and user
+space program. The kernel can ignore the mask attribute, installing an exact
+match flow, or reduce the number of don't care bits in the kernel to less than
+what was specified by the user space program. In this case, variations in bits
+that the kernel does not implement will simply result in additional flow setups.
+The kernel module will also work with user space programs that neither support
+nor supply flow mask attributes.
+
+Since the kernel may ignore or modify wildcard bits, it can be difficult for
+the userspace program to know exactly what matches are installed. There are
+two possible approaches: reactively install flows as they miss the kernel
+flow table (and therefore not attempt to determine wildcard changes at all)
+or use the kernel's response messages to determine the installed wildcards.
+
+When interacting with userspace, the kernel should maintain the match portion
+of the key exactly as originally installed. This will provides a handle to
+identify the flow for all future operations. However, when reporting the
+mask of an installed flow, the mask should include any restrictions imposed
+by the kernel.
+
+The behavior when using overlapping wildcarded flows is undefined. It is the
+responsibility of the user space program to ensure that any incoming packet
+can match at most one flow, wildcarded or not. The current implementation
+performs best-effort detection of overlapping wildcarded flows and may reject
+some but not all of them. However, this behavior may change in future versions.
+
+
Basic rule for evolving flow keys
---------------------------------
diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 52490b0e62b5..de1fa5d3780f 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -1,6 +1,6 @@
/*
- * Copyright (c) 2007-2011 Nicira Networks.
+ * Copyright (c) 2007-2013 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -379,6 +379,12 @@ struct ovs_key_nd {
* @OVS_FLOW_ATTR_CLEAR: If present in a %OVS_FLOW_CMD_SET request, clears the
* last-used time, accumulated TCP flags, and statistics for this flow.
* Otherwise ignored in requests. Never present in notifications.
+ * @OVS_FLOW_ATTR_MASK: Nested %OVS_KEY_ATTR_* attributes specifying the
+ * mask bits for wildcarded flow match. Mask bit value '1' specifies exact
+ * match with corresponding flow key bit, while mask bit value '0' specifies
+ * a wildcarded match. Omitting attribute is treated as wildcarding all
+ * corresponding fields. Optional for all requests. If not present,
+ * all flow key bits are exact match bits.
*
* These attributes follow the &struct ovs_header within the Generic Netlink
* payload for %OVS_FLOW_* commands.
@@ -391,6 +397,7 @@ enum ovs_flow_attr {
OVS_FLOW_ATTR_TCP_FLAGS, /* 8-bit OR'd TCP flags. */
OVS_FLOW_ATTR_USED, /* u64 msecs last used in monotonic time. */
OVS_FLOW_ATTR_CLEAR, /* Flag to clear stats, tcp_flags, used. */
+ OVS_FLOW_ATTR_MASK, /* Sequence of OVS_KEY_ATTR_* attributes. */
__OVS_FLOW_ATTR_MAX
};
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index ab101f715447..1f680222f4f7 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007-2012 Nicira, Inc.
+ * Copyright (c) 2007-2013 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -376,8 +376,10 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb,
const struct nlattr *a;
int rem;
+ BUG_ON(!OVS_CB(skb)->pkt_key);
+
upcall.cmd = OVS_PACKET_CMD_ACTION;
- upcall.key = &OVS_CB(skb)->flow->key;
+ upcall.key = OVS_CB(skb)->pkt_key;
upcall.userdata = NULL;
upcall.portid = 0;
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 9d97ef3c9830..d29cd9aa4a67 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007-2012 Nicira, Inc.
+ * Copyright (c) 2007-2013 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -165,7 +165,7 @@ static void destroy_dp_rcu(struct rcu_head *rcu)
{
struct datapath *dp = container_of(rcu, struct datapath, rcu);
- ovs_flow_tbl_destroy((__force struct flow_table *)dp->table);
+ ovs_flow_tbl_destroy((__force struct flow_table *)dp->table, false);
free_percpu(dp->stats_percpu);
release_net(ovs_dp_get_net(dp));
kfree(dp->ports);
@@ -226,19 +226,18 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
struct sw_flow_key key;
u64 *stats_counter;
int error;
- int key_len;
stats = this_cpu_ptr(dp->stats_percpu);
/* Extract flow from 'skb' into 'key'. */
- error = ovs_flow_extract(skb, p->port_no, &key, &key_len);
+ error = ovs_flow_extract(skb, p->port_no, &key);
if (unlikely(error)) {
kfree_skb(skb);
return;
}
/* Look up flow. */
- flow = ovs_flow_tbl_lookup(rcu_dereference(dp->table), &key, key_len);
+ flow = ovs_flow_lookup(rcu_dereference(dp->table), &key);
if (unlikely(!flow)) {
struct dp_upcall_info upcall;
@@ -253,6 +252,7 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
}
OVS_CB(skb)->flow = flow;
+ OVS_CB(skb)->pkt_key = &key;
stats_counter = &stats->n_hit;
ovs_flow_used(OVS_CB(skb)->flow, skb);
@@ -435,7 +435,7 @@ static int queue_userspace_packet(struct net *net, int dp_ifindex,
upcall->dp_ifindex = dp_ifindex;
nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_KEY);
- ovs_flow_to_nlattrs(upcall_info->key, user_skb);
+ ovs_flow_to_nlattrs(upcall_info->key, upcall_info->key, user_skb);
nla_nest_end(user_skb, nla);
if (upcall_info->userdata)
@@ -468,7 +468,7 @@ static int flush_flows(struct datapath *dp)
rcu_assign_pointer(dp->table, new_table);
- ovs_flow_tbl_deferred_destroy(old_table);
+ ovs_flow_tbl_destroy(old_table, true);
return 0;
}
@@ -611,10 +611,12 @@ static int validate_tp_port(const struct sw_flow_key *flow_key)
static int validate_and_copy_set_tun(const struct nlattr *attr,
struct sw_flow_actions **sfa)
{
- struct ovs_key_ipv4_tunnel tun_key;
+ struct sw_flow_match match;
+ struct sw_flow_key key;
int err, start;
- err = ovs_ipv4_tun_from_nlattr(nla_data(attr), &tun_key);
+ ovs_match_init(&match, &key, NULL);
+ err = ovs_ipv4_tun_from_nlattr(nla_data(attr), &match, false);
if (err)
return err;
@@ -622,7 +624,8 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
if (start < 0)
return start;
- err = add_action(sfa, OVS_KEY_ATTR_IPV4_TUNNEL, &tun_key, sizeof(tun_key));
+ err = add_action(sfa, OVS_KEY_ATTR_IPV4_TUNNEL, &match.key->tun_key,
+ sizeof(match.key->tun_key));
add_nested_action_end(*sfa, start);
return err;
@@ -857,7 +860,6 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
struct ethhdr *eth;
int len;
int err;
- int key_len;
err = -EINVAL;
if (!a[OVS_PACKET_ATTR_PACKET] || !a[OVS_PACKET_ATTR_KEY] ||
@@ -890,11 +892,11 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
if (IS_ERR(flow))
goto err_kfree_skb;
- err = ovs_flow_extract(packet, -1, &flow->key, &key_len);
+ err = ovs_flow_extract(packet, -1, &flow->key);
if (err)
goto err_flow_free;
- err = ovs_flow_metadata_from_nlattrs(flow, key_len, a[OVS_PACKET_ATTR_KEY]);
+ err = ovs_flow_metadata_from_nlattrs(flow, a[OVS_PACKET_ATTR_KEY]);
if (err)
goto err_flow_free;
acts = ovs_flow_actions_alloc(nla_len(a[OVS_PACKET_ATTR_ACTIONS]));
@@ -908,6 +910,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
goto err_flow_free;
OVS_CB(packet)->flow = flow;
+ OVS_CB(packet)->pkt_key = &flow->key;
packet->priority = flow->key.phy.priority;
packet->mark = flow->key.phy.skb_mark;
@@ -922,13 +925,13 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
local_bh_enable();
rcu_read_unlock();
- ovs_flow_free(flow);
+ ovs_flow_free(flow, false);
return err;
err_unlock:
rcu_read_unlock();
err_flow_free:
- ovs_flow_free(flow);
+ ovs_flow_free(flow, false);
err_kfree_skb:
kfree_skb(packet);
err:
@@ -1045,7 +1048,8 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb)
if (!start)
return -EMSGSIZE;
- err = ovs_ipv4_tun_to_nlattr(skb, nla_data(ovs_key));
+ err = ovs_ipv4_tun_to_nlattr(skb, nla_data(ovs_key),
+ nla_data(ovs_key));
if (err)
return err;
nla_nest_end(skb, start);
@@ -1093,6 +1097,7 @@ static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts)
{
return NLMSG_ALIGN(sizeof(struct ovs_header))
+ nla_total_size(key_attr_size()) /* OVS_FLOW_ATTR_KEY */
+ + nla_total_size(key_attr_size()) /* OVS_FLOW_ATTR_MASK */
+ nla_total_size(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */
+ nla_total_size(1) /* OVS_FLOW_ATTR_TCP_FLAGS */
+ nla_total_size(8) /* OVS_FLOW_ATTR_USED */
@@ -1119,12 +1124,25 @@ static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp,
ovs_header->dp_ifindex = get_dpifindex(dp);
+ /* Fill flow key. */
nla = nla_nest_start(skb, OVS_FLOW_ATTR_KEY);
if (!nla)
goto nla_put_failure;
- err = ovs_flow_to_nlattrs(&flow->key, skb);
+
+ err = ovs_flow_to_nlattrs(&flow->unmasked_key,
+ &flow->unmasked_key, skb);
+ if (err)
+ goto error;
+ nla_nest_end(skb, nla);
+
+ nla = nla_nest_start(skb, OVS_FLOW_ATTR_MASK);
+ if (!nla)
+ goto nla_put_failure;
+
+ err = ovs_flow_to_nlattrs(&flow->key, &flow->mask->key, skb);
if (err)
goto error;
+
nla_nest_end(skb, nla);
spin_lock_bh(&flow->lock);
@@ -1214,20 +1232,24 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr **a = info->attrs;
struct ovs_header *ovs_header = info->userhdr;
- struct sw_flow_key key;
- struct sw_flow *flow;
+ struct sw_flow_key key, masked_key;
+ struct sw_flow *flow = NULL;
+ struct sw_flow_mask mask;
struct sk_buff *reply;
struct datapath *dp;
struct flow_table *table;
struct sw_flow_actions *acts = NULL;
+ struct sw_flow_match match;
int error;
- int key_len;
/* Extract key. */
error = -EINVAL;
if (!a[OVS_FLOW_ATTR_KEY])
goto error;
- error = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]);
+
+ ovs_match_init(&match, &key, &mask);
+ error = ovs_match_from_nlattrs(&match,
+ a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK]);
if (error)
goto error;
@@ -1238,9 +1260,13 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
if (IS_ERR(acts))
goto error;
- error = validate_and_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &key, 0, &acts);
- if (error)
+ ovs_flow_key_mask(&masked_key, &key, &mask);
+ error = validate_and_copy_actions(a[OVS_FLOW_ATTR_ACTIONS],
+ &masked_key, 0, &acts);
+ if (error) {
+ OVS_NLERR("Flow actions may not be safe on all matching packets.\n");
goto err_kfree;
+ }
} else if (info->genlhdr->cmd == OVS_FLOW_CMD_NEW) {
error = -EINVAL;
goto error;
@@ -1253,8 +1279,11 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
goto err_unlock_ovs;
table = ovsl_dereference(dp->table);
- flow = ovs_flow_tbl_lookup(table, &key, key_len);
+
+ /* Check if this is a duplicate flow */
+ flow = ovs_flow_lookup(table, &key);
if (!flow) {
+ struct sw_flow_mask *mask_p;
/* Bail out if we're not allowed to create a new flow. */
error = -ENOENT;
if (info->genlhdr->cmd == OVS_FLOW_CMD_SET)
@@ -1267,7 +1296,7 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
new_table = ovs_flow_tbl_expand(table);
if (!IS_ERR(new_table)) {
rcu_assign_pointer(dp->table, new_table);
- ovs_flow_tbl_deferred_destroy(table);
+ ovs_flow_tbl_destroy(table, true);
table = ovsl_dereference(dp->table);
}
}
@@ -1280,14 +1309,30 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
}
clear_stats(flow);
+ flow->key = masked_key;
+ flow->unmasked_key = key;
+
+ /* Make sure mask is unique in the system */
+ mask_p = ovs_sw_flow_mask_find(table, &mask);
+ if (!mask_p) {
+ /* Allocate a new mask if none exsits. */
+ mask_p = ovs_sw_flow_mask_alloc();
+ if (!mask_p)
+ goto err_flow_free;
+ mask_p->key = mask.key;
+ mask_p->range = mask.range;
+ ovs_sw_flow_mask_insert(table, mask_p);
+ }
+
+ ovs_sw_flow_mask_add_ref(mask_p);
+ flow->mask = mask_p;
rcu_assign_pointer(flow->sf_acts, acts);
/* Put flow in bucket. */
- ovs_flow_tbl_insert(table, flow, &key, key_len);
+ ovs_flow_insert(table, flow);
reply = ovs_flow_cmd_build_info(flow, dp, info->snd_portid,
- info->snd_seq,
- OVS_FLOW_CMD_NEW);
+ info->snd_seq, OVS_FLOW_CMD_NEW);
} else {
/* We found a matching flow. */
struct sw_flow_actions *old_acts;
@@ -1303,6 +1348,13 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL))
goto err_unlock_ovs;
+ /* The unmasked key has to be the same for flow updates. */
+ error = -EINVAL;
+ if (!ovs_flow_cmp_unmasked_key(flow, &key, match.range.end)) {
+ OVS_NLERR("Flow modification message rejected, unmasked key does not match.\n");
+ goto err_unlock_ovs;
+ }
+
/* Update actions. */
old_acts = ovsl_dereference(flow->sf_acts);
rcu_assign_pointer(flow->sf_acts, acts);
@@ -1327,6 +1379,8 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
ovs_dp_flow_multicast_group.id, PTR_ERR(reply));
return 0;
+err_flow_free:
+ ovs_flow_free(flow, false);
err_unlock_ovs:
ovs_unlock();
err_kfree:
@@ -1344,12 +1398,16 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
struct sw_flow *flow;
struct datapath *dp;
struct flow_table *table;
+ struct sw_flow_match match;
int err;
- int key_len;
- if (!a[OVS_FLOW_ATTR_KEY])
+ if (!a[OVS_FLOW_ATTR_KEY]) {
+ OVS_NLERR("Flow get message rejected, Key attribute missing.\n");
return -EINVAL;
- err = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]);
+ }
+
+ ovs_match_init(&match, &key, NULL);
+ err = ovs_match_from_nlattrs(&match, a[OVS_FLOW_ATTR_KEY], NULL);
if (err)
return err;
@@ -1361,7 +1419,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
}
table = ovsl_dereference(dp->table);
- flow = ovs_flow_tbl_lookup(table, &key, key_len);
+ flow = ovs_flow_lookup_unmasked_key(table, &match);
if (!flow) {
err = -ENOENT;
goto unlock;
@@ -1390,8 +1448,8 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
struct sw_flow *flow;
struct datapath *dp;
struct flow_table *table;
+ struct sw_flow_match match;
int err;
- int key_len;
ovs_lock();
dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
@@ -1404,12 +1462,14 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
err = flush_flows(dp);
goto unlock;
}
- err = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]);
+
+ ovs_match_init(&match, &key, NULL);
+ err = ovs_match_from_nlattrs(&match, a[OVS_FLOW_ATTR_KEY], NULL);
if (err)
goto unlock;
table = ovsl_dereference(dp->table);
- flow = ovs_flow_tbl_lookup(table, &key, key_len);
+ flow = ovs_flow_lookup_unmasked_key(table, &match);
if (!flow) {
err = -ENOENT;
goto unlock;
@@ -1421,13 +1481,13 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
goto unlock;
}
- ovs_flow_tbl_remove(table, flow);
+ ovs_flow_remove(table, flow);
err = ovs_flow_cmd_fill_info(flow, dp, reply, info->snd_portid,
info->snd_seq, 0, OVS_FLOW_CMD_DEL);
BUG_ON(err < 0);
- ovs_flow_deferred_free(flow);
+ ovs_flow_free(flow, true);
ovs_unlock();
ovs_notify(reply, info, &ovs_dp_flow_multicast_group);
@@ -1457,7 +1517,7 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
bucket = cb->args[0];
obj = cb->args[1];
- flow = ovs_flow_tbl_next(table, &bucket, &obj);
+ flow = ovs_flow_dump_next(table, &bucket, &obj);
if (!flow)
break;
@@ -1680,7 +1740,7 @@ err_destroy_ports_array:
err_destroy_percpu:
free_percpu(dp->stats_percpu);
err_destroy_table:
- ovs_flow_tbl_destroy(ovsl_dereference(dp->table));
+ ovs_flow_tbl_destroy(ovsl_dereference(dp->table), false);
err_free_dp:
release_net(ovs_dp_get_net(dp));
kfree(dp);
@@ -2287,7 +2347,7 @@ static void rehash_flow_table(struct work_struct *work)
new_table = ovs_flow_tbl_rehash(old_table);
if (!IS_ERR(new_table)) {
rcu_assign_pointer(dp->table, new_table);
- ovs_flow_tbl_deferred_destroy(old_table);
+ ovs_flow_tbl_destroy(old_table, true);
}
}
}
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index a91486484916..4d109c176ef3 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -88,11 +88,13 @@ struct datapath {
/**
* struct ovs_skb_cb - OVS data in skb CB
* @flow: The flow associated with this packet. May be %NULL if no flow.
+ * @pkt_key: The flow information extracted from the packet. Must be nonnull.
* @tun_key: Key for the tunnel that encapsulated this packet. NULL if the
* packet is not being tunneled.
*/
struct ovs_skb_cb {
struct sw_flow *flow;
+ struct sw_flow_key *pkt_key;
struct ovs_key_ipv4_tunnel *tun_key;
};
#define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb)
@@ -183,4 +185,8 @@ struct sk_buff *ovs_vport_cmd_build_info(struct vport *, u32 pid, u32 seq,
int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb);
void ovs_dp_notify_wq(struct work_struct *work);
+
+#define OVS_NLERR(fmt, ...) \
+ pr_info_once("netlink: " fmt, ##__VA_ARGS__)
+
#endif /* datapath.h */
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index fca282520cee..1fceb9653598 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007-2011 Nicira, Inc.
+ * Copyright (c) 2007-2013 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -46,6 +46,184 @@
static struct kmem_cache *flow_cache;
+static void ovs_sw_flow_mask_set(struct sw_flow_mask *mask,
+ struct sw_flow_key_range *range, u8 val);
+
+static void update_range__(struct sw_flow_match *match,
+ size_t offset, size_t size, bool is_mask)
+{
+ struct sw_flow_key_range *range = NULL;
+ size_t start = offset;
+ size_t end = offset + size;
+
+ if (!is_mask)
+ range = &match->range;
+ else if (match->mask)
+ range = &match->mask->range;
+
+ if (!range)
+ return;
+
+ if (range->start == range->end) {
+ range->start = start;
+ range->end = end;
+ return;
+ }
+
+ if (range->start > start)
+ range->start = start;
+
+ if (range->end < end)
+ range->end = end;
+}
+
+#define SW_FLOW_KEY_PUT(match, field, value, is_mask) \
+ do { \
+ update_range__(match, offsetof(struct sw_flow_key, field), \
+ sizeof((match)->key->field), is_mask); \
+ if (is_mask) { \
+ if ((match)->mask) \
+ (match)->mask->key.field = value; \
+ } else { \
+ (match)->key->field = value; \
+ } \
+ } while (0)
+
+#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask) \
+ do { \
+ update_range__(match, offsetof(struct sw_flow_key, field), \
+ len, is_mask); \
+ if (is_mask) { \
+ if ((match)->mask) \
+ memcpy(&(match)->mask->key.field, value_p, len);\
+ } else { \
+ memcpy(&(match)->key->field, value_p, len); \
+ } \
+ } while (0)
+
+void ovs_match_init(struct sw_flow_match *match,
+ struct sw_flow_key *key,
+ struct sw_flow_mask *mask)
+{
+ memset(match, 0, sizeof(*match));
+ match->key = key;
+ match->mask = mask;
+
+ memset(key, 0, sizeof(*key));
+
+ if (mask) {
+ memset(&mask->key, 0, sizeof(mask->key));
+ mask->range.start = mask->range.end = 0;
+ }
+}
+
+static bool ovs_match_validate(const struct sw_flow_match *match,
+ u64 key_attrs, u64 mask_attrs)
+{
+ u64 key_expected = 1 << OVS_KEY_ATTR_ETHERNET;
+ u64 mask_allowed = key_attrs; /* At most allow all key attributes */
+
+ /* The following mask attributes allowed only if they
+ * pass the validation tests. */
+ mask_allowed &= ~((1 << OVS_KEY_ATTR_IPV4)
+ | (1 << OVS_KEY_ATTR_IPV6)
+ | (1 << OVS_KEY_ATTR_TCP)
+ | (1 << OVS_KEY_ATTR_UDP)
+ | (1 << OVS_KEY_ATTR_ICMP)
+ | (1 << OVS_KEY_ATTR_ICMPV6)
+ | (1 << OVS_KEY_ATTR_ARP)
+ | (1 << OVS_KEY_ATTR_ND));
+
+ /* Always allowed mask fields. */
+ mask_allowed |= ((1 << OVS_KEY_ATTR_TUNNEL)
+ | (1 << OVS_KEY_ATTR_IN_PORT)
+ | (1 << OVS_KEY_ATTR_ETHERTYPE));
+
+ /* Check key attributes. */
+ if (match->key->eth.type == htons(ETH_P_ARP)
+ || match->key->eth.type == htons(ETH_P_RARP)) {
+ key_expected |= 1 << OVS_KEY_ATTR_ARP;
+ if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
+ mask_allowed |= 1 << OVS_KEY_ATTR_ARP;
+ }
+
+ if (match->key->eth.type == htons(ETH_P_IP)) {
+ key_expected |= 1 << OVS_KEY_ATTR_IPV4;
+ if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
+ mask_allowed |= 1 << OVS_KEY_ATTR_IPV4;
+
+ if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) {
+ if (match->key->ip.proto == IPPROTO_UDP) {
+ key_expected |= 1 << OVS_KEY_ATTR_UDP;
+ if (match->mask && (match->mask->key.ip.proto == 0xff))
+ mask_allowed |= 1 << OVS_KEY_ATTR_UDP;
+ }
+
+ if (match->key->ip.proto == IPPROTO_TCP) {
+ key_expected |= 1 << OVS_KEY_ATTR_TCP;
+ if (match->mask && (match->mask->key.ip.proto == 0xff))
+ mask_allowed |= 1 << OVS_KEY_ATTR_TCP;
+ }
+
+ if (match->key->ip.proto == IPPROTO_ICMP) {
+ key_expected |= 1 << OVS_KEY_ATTR_ICMP;
+ if (match->mask && (match->mask->key.ip.proto == 0xff))
+ mask_allowed |= 1 << OVS_KEY_ATTR_ICMP;
+ }
+ }
+ }
+
+ if (match->key->eth.type == htons(ETH_P_IPV6)) {
+ key_expected |= 1 << OVS_KEY_ATTR_IPV6;
+ if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
+ mask_allowed |= 1 << OVS_KEY_ATTR_IPV6;
+
+ if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) {
+ if (match->key->ip.proto == IPPROTO_UDP) {
+ key_expected |= 1 << OVS_KEY_ATTR_UDP;
+ if (match->mask && (match->mask->key.ip.proto == 0xff))
+ mask_allowed |= 1 << OVS_KEY_ATTR_UDP;
+ }
+
+ if (match->key->ip.proto == IPPROTO_TCP) {
+ key_expected |= 1 << OVS_KEY_ATTR_TCP;
+ if (match->mask && (match->mask->key.ip.proto == 0xff))
+ mask_allowed |= 1 << OVS_KEY_ATTR_TCP;
+ }
+
+ if (match->key->ip.proto == IPPROTO_ICMPV6) {
+ key_expected |= 1 << OVS_KEY_ATTR_ICMPV6;
+ if (match->mask && (match->mask->key.ip.proto == 0xff))
+ mask_allowed |= 1 << OVS_KEY_ATTR_ICMPV6;
+
+ if (match->key->ipv6.tp.src ==
+ htons(NDISC_NEIGHBOUR_SOLICITATION) ||
+ match->key->ipv6.tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) {
+ key_expected |= 1 << OVS_KEY_ATTR_ND;
+ if (match->mask && (match->mask->key.ipv6.tp.src == htons(0xffff)))
+ mask_allowed |= 1 << OVS_KEY_ATTR_ND;
+ }
+ }
+ }
+ }
+
+ if ((key_attrs & key_expected) != key_expected) {
+ /* Key attributes check failed. */
+ OVS_NLERR("Missing expected key attributes (key_attrs=%llx, expected=%llx).\n",
+ key_attrs, key_expected);
+ return false;
+ }
+
+ if ((mask_attrs & mask_allowed) != mask_attrs) {
+ /* Mask attributes check failed. */
+ OVS_NLERR("Contain more than allowed mask fields (mask_attrs=%llx, mask_allowed=%llx).\n",
+ mask_attrs, mask_allowed);
+ return false;
+ }
+
+ return true;
+}
+
static int check_header(struct sk_buff *skb, int len)
{
if (unlikely(skb->len < len))
@@ -121,12 +299,7 @@ u64 ovs_flow_used_time(unsigned long flow_jiffies)
return cur_ms - idle_ms;
}
-#define SW_FLOW_KEY_OFFSET(field) \
- (offsetof(struct sw_flow_key, field) + \
- FIELD_SIZEOF(struct sw_flow_key, field))
-
-static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key,
- int *key_lenp)
+static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key)
{
unsigned int nh_ofs = skb_network_offset(skb);
unsigned int nh_len;
@@ -136,8 +309,6 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key,
__be16 frag_off;
int err;
- *key_lenp = SW_FLOW_KEY_OFFSET(ipv6.label);
-
err = check_header(skb, nh_ofs + sizeof(*nh));
if (unlikely(err))
return err;
@@ -176,6 +347,21 @@ static bool icmp6hdr_ok(struct sk_buff *skb)
sizeof(struct icmp6hdr));
}
+void ovs_flow_key_mask(struct sw_flow_key *dst, const struct sw_flow_key *src,
+ const struct sw_flow_mask *mask)
+{
+ u8 *m = (u8 *)&mask->key + mask->range.start;
+ u8 *s = (u8 *)src + mask->range.start;
+ u8 *d = (u8 *)dst + mask->range.start;
+ int i;
+
+ memset(dst, 0, sizeof(*dst));
+ for (i = 0; i < ovs_sw_flow_mask_size_roundup(mask); i++) {
+ *d = *s & *m;
+ d++, s++, m++;
+ }
+}
+
#define TCP_FLAGS_OFFSET 13
#define TCP_FLAG_MASK 0x3f
@@ -224,6 +410,7 @@ struct sw_flow *ovs_flow_alloc(void)
spin_lock_init(&flow->lock);
flow->sf_acts = NULL;
+ flow->mask = NULL;
return flow;
}
@@ -263,7 +450,7 @@ static void free_buckets(struct flex_array *buckets)
flex_array_free(buckets);
}
-struct flow_table *ovs_flow_tbl_alloc(int new_size)
+static struct flow_table *__flow_tbl_alloc(int new_size)
{
struct flow_table *table = kmalloc(sizeof(*table), GFP_KERNEL);
@@ -281,17 +468,15 @@ struct flow_table *ovs_flow_tbl_alloc(int new_size)
table->node_ver = 0;
table->keep_flows = false;
get_random_bytes(&table->hash_seed, sizeof(u32));
+ table->mask_list = NULL;
return table;
}
-void ovs_flow_tbl_destroy(struct flow_table *table)
+static void __flow_tbl_destroy(struct flow_table *table)
{
int i;
- if (!table)
- return;
-
if (table->keep_flows)
goto skip_flows;
@@ -303,31 +488,55 @@ void ovs_flow_tbl_destroy(struct flow_table *table)
hlist_for_each_entry_safe(flow, n, head, hash_node[ver]) {
hlist_del(&flow->hash_node[ver]);
- ovs_flow_free(flow);
+ ovs_flow_free(flow, false);
}
}
+ BUG_ON(!list_empty(table->mask_list));
+ kfree(table->mask_list);
+
skip_flows:
free_buckets(table->buckets);
kfree(table);
}
+struct flow_table *ovs_flow_tbl_alloc(int new_size)
+{
+ struct flow_table *table = __flow_tbl_alloc(new_size);
+
+ if (!table)
+ return NULL;
+
+ table->mask_list = kmalloc(sizeof(struct list_head), GFP_KERNEL);
+ if (!table->mask_list) {
+ table->keep_flows = true;
+ __flow_tbl_destroy(table);
+ return NULL;
+ }
+ INIT_LIST_HEAD(table->mask_list);
+
+ return table;
+}
+
static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu)
{
struct flow_table *table = container_of(rcu, struct flow_table, rcu);
- ovs_flow_tbl_destroy(table);
+ __flow_tbl_destroy(table);
}
-void ovs_flow_tbl_deferred_destroy(struct flow_table *table)
+void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred)
{
if (!table)
return;
- call_rcu(&table->rcu, flow_tbl_destroy_rcu_cb);
+ if (deferred)
+ call_rcu(&table->rcu, flow_tbl_destroy_rcu_cb);
+ else
+ __flow_tbl_destroy(table);
}
-struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *last)
+struct sw_flow *ovs_flow_dump_next(struct flow_table *table, u32 *bucket, u32 *last)
{
struct sw_flow *flow;
struct hlist_head *head;
@@ -353,11 +562,13 @@ struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *la
return NULL;
}
-static void __flow_tbl_insert(struct flow_table *table, struct sw_flow *flow)
+static void __tbl_insert(struct flow_table *table, struct sw_flow *flow)
{
struct hlist_head *head;
+
head = find_bucket(table, flow->hash);
hlist_add_head_rcu(&flow->hash_node[table->node_ver], head);
+
table->count++;
}
@@ -377,8 +588,10 @@ static void flow_table_copy_flows(struct flow_table *old, struct flow_table *new
head = flex_array_get(old->buckets, i);
hlist_for_each_entry(flow, head, hash_node[old_ver])
- __flow_tbl_insert(new, flow);
+ __tbl_insert(new, flow);
}
+
+ new->mask_list = old->mask_list;
old->keep_flows = true;
}
@@ -386,7 +599,7 @@ static struct flow_table *__flow_tbl_rehash(struct flow_table *table, int n_buck
{
struct flow_table *new_table;
- new_table = ovs_flow_tbl_alloc(n_buckets);
+ new_table = __flow_tbl_alloc(n_buckets);
if (!new_table)
return ERR_PTR(-ENOMEM);
@@ -405,28 +618,30 @@ struct flow_table *ovs_flow_tbl_expand(struct flow_table *table)
return __flow_tbl_rehash(table, table->n_buckets * 2);
}
-void ovs_flow_free(struct sw_flow *flow)
+static void __flow_free(struct sw_flow *flow)
{
- if (unlikely(!flow))
- return;
-
kfree((struct sf_flow_acts __force *)flow->sf_acts);
kmem_cache_free(flow_cache, flow);
}
-/* RCU callback used by ovs_flow_deferred_free. */
static void rcu_free_flow_callback(struct rcu_head *rcu)
{
struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu);
- ovs_flow_free(flow);
+ __flow_free(flow);
}
-/* Schedules 'flow' to be freed after the next RCU grace period.
- * The caller must hold rcu_read_lock for this to be sensible. */
-void ovs_flow_deferred_free(struct sw_flow *flow)
+void ovs_flow_free(struct sw_flow *flow, bool deferred)
{
- call_rcu(&flow->rcu, rcu_free_flow_callback);
+ if (!flow)
+ return;
+
+ ovs_sw_flow_mask_del_ref(flow->mask, deferred);
+
+ if (deferred)
+ call_rcu(&flow->rcu, rcu_free_flow_callback);
+ else
+ __flow_free(flow);
}
/* Schedules 'sf_acts' to be freed after the next RCU grace period.
@@ -497,18 +712,15 @@ static __be16 parse_ethertype(struct sk_buff *skb)
}
static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
- int *key_lenp, int nh_len)
+ int nh_len)
{
struct icmp6hdr *icmp = icmp6_hdr(skb);
- int error = 0;
- int key_len;
/* The ICMPv6 type and code fields use the 16-bit transport port
* fields, so we need to store them in 16-bit network byte order.
*/
key->ipv6.tp.src = htons(icmp->icmp6_type);
key->ipv6.tp.dst = htons(icmp->icmp6_code);
- key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
if (icmp->icmp6_code == 0 &&
(icmp->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION ||
@@ -517,21 +729,17 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
struct nd_msg *nd;
int offset;
- key_len = SW_FLOW_KEY_OFFSET(ipv6.nd);
-
/* In order to process neighbor discovery options, we need the
* entire packet.
*/
if (unlikely(icmp_len < sizeof(*nd)))
- goto out;
- if (unlikely(skb_linearize(skb))) {
- error = -ENOMEM;
- goto out;
- }
+ return 0;
+
+ if (unlikely(skb_linearize(skb)))
+ return -ENOMEM;
nd = (struct nd_msg *)skb_transport_header(skb);
key->ipv6.nd.target = nd->target;
- key_len = SW_FLOW_KEY_OFFSET(ipv6.nd);
icmp_len -= sizeof(*nd);
offset = 0;
@@ -541,7 +749,7 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
int opt_len = nd_opt->nd_opt_len * 8;
if (unlikely(!opt_len || opt_len > icmp_len))
- goto invalid;
+ return 0;
/* Store the link layer address if the appropriate
* option is provided. It is considered an error if
@@ -566,16 +774,14 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
}
}
- goto out;
+ return 0;
invalid:
memset(&key->ipv6.nd.target, 0, sizeof(key->ipv6.nd.target));
memset(key->ipv6.nd.sll, 0, sizeof(key->ipv6.nd.sll));
memset(key->ipv6.nd.tll, 0, sizeof(key->ipv6.nd.tll));
-out:
- *key_lenp = key_len;
- return error;
+ return 0;
}
/**
@@ -584,7 +790,6 @@ out:
* Ethernet header
* @in_port: port number on which @skb was received.
* @key: output flow key
- * @key_lenp: length of output flow key
*
* The caller must ensure that skb->len >= ETH_HLEN.
*
@@ -602,11 +807,9 @@ out:
* of a correct length, otherwise the same as skb->network_header.
* For other key->eth.type values it is left untouched.
*/
-int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
- i