diff options
75 files changed, 1383 insertions, 858 deletions
diff --git a/Documentation/networking/nf_flowtable.txt b/Documentation/networking/nf_flowtable.txt new file mode 100644 index 000000000000..54128c50d508 --- /dev/null +++ b/Documentation/networking/nf_flowtable.txt @@ -0,0 +1,112 @@ +Netfilter's flowtable infrastructure +==================================== + +This documentation describes the software flowtable infrastructure available in +Netfilter since Linux kernel 4.16. + +Overview +-------- + +Initial packets follow the classic forwarding path, once the flow enters the +established state according to the conntrack semantics (ie. we have seen traffic +in both directions), then you can decide to offload the flow to the flowtable +from the forward chain via the 'flow offload' action available in nftables. + +Packets that find an entry in the flowtable (ie. flowtable hit) are sent to the +output netdevice via neigh_xmit(), hence, they bypass the classic forwarding +path (the visible effect is that you do not see these packets from any of the +netfilter hooks coming after the ingress). In case of flowtable miss, the packet +follows the classic forward path. + +The flowtable uses a resizable hashtable, lookups are based on the following +7-tuple selectors: source, destination, layer 3 and layer 4 protocols, source +and destination ports and the input interface (useful in case there are several +conntrack zones in place). + +Flowtables are populated via the 'flow offload' nftables action, so the user can +selectively specify what flows are placed into the flow table. Hence, packets +follow the classic forwarding path unless the user explicitly instruct packets +to use this new alternative forwarding path via nftables policy. + +This is represented in Fig.1, which describes the classic forwarding path +including the Netfilter hooks and the flowtable fastpath bypass. + + userspace process + ^ | + | | + _____|____ ____\/___ + / \ / \ + | input | | output | + \__________/ \_________/ + ^ | + | | + _________ __________ --------- _____\/_____ + / \ / \ |Routing | / \ + --> ingress ---> prerouting ---> |decision| | postrouting |--> neigh_xmit + \_________/ \__________/ ---------- \____________/ ^ + | ^ | | ^ | + flowtable | | ____\/___ | | + | | | / \ | | + __\/___ | --------->| forward |------------ | + |-----| | \_________/ | + |-----| | 'flow offload' rule | + |-----| | adds entry to | + |_____| | flowtable | + | | | + / \ | | + /hit\_no_| | + \ ? / | + \ / | + |__yes_________________fastpath bypass ____________________________| + + Fig.1 Netfilter hooks and flowtable interactions + +The flowtable entry also stores the NAT configuration, so all packets are +mangled according to the NAT policy that matches the initial packets that went +through the classic forwarding path. The TTL is decremented before calling +neigh_xmit(). Fragmented traffic is passed up to follow the classic forwarding +path given that the transport selectors are missing, therefore flowtable lookup +is not possible. + +Example configuration +--------------------- + +Enabling the flowtable bypass is relatively easy, you only need to create a +flowtable and add one rule to your forward chain. + + table inet x { + flowtable f { + hook ingress priority 0 devices = { eth0, eth1 }; + } + chain y { + type filter hook forward priority 0; policy accept; + ip protocol tcp flow offload @f + counter packets 0 bytes 0 + } + } + +This example adds the flowtable 'f' to the ingress hook of the eth0 and eth1 +netdevices. You can create as many flowtables as you want in case you need to +perform resource partitioning. The flowtable priority defines the order in which +hooks are run in the pipeline, this is convenient in case you already have a +nftables ingress chain (make sure the flowtable priority is smaller than the +nftables ingress chain hence the flowtable runs before in the pipeline). + +The 'flow offload' action from the forward chain 'y' adds an entry to the +flowtable for the TCP syn-ack packet coming in the reply direction. Once the +flow is offloaded, you will observe that the counter rule in the example above +does not get updated for the packets that are being forwarded through the +forwarding bypass. + +More reading +------------ + +This documentation is based on the LWN.net articles [1][2]. Rafal Milecki also +made a very complete and comprehensive summary called "A state of network +acceleration" that describes how things were before this infrastructure was +mailined [3] and it also makes a rough summary of this work [4]. + +[1] https://lwn.net/Articles/738214/ +[2] https://lwn.net/Articles/742164/ +[3] http://lists.infradead.org/pipermail/lede-dev/2018-January/010830.html +[4] http://lists.infradead.org/pipermail/lede-dev/2018-January/010829.html diff --git a/include/linux/netfilter/nfnetlink_acct.h b/include/linux/netfilter/nfnetlink_acct.h index b4d741195c28..beee8bffe49e 100644 --- a/include/linux/netfilter/nfnetlink_acct.h +++ b/include/linux/netfilter/nfnetlink_acct.h @@ -16,6 +16,5 @@ struct nf_acct; struct nf_acct *nfnl_acct_find_get(struct net *net, const char *filter_name); void nfnl_acct_put(struct nf_acct *acct); void nfnl_acct_update(const struct sk_buff *skb, struct nf_acct *nfacct); -int nfnl_acct_overquota(struct net *net, const struct sk_buff *skb, - struct nf_acct *nfacct); +int nfnl_acct_overquota(struct net *net, struct nf_acct *nfacct); #endif /* _NFNL_ACCT_H */ diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 14529511c4b8..9077b3ebea08 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -281,6 +281,8 @@ int xt_check_entry_offsets(const void *base, const char *elems, unsigned int target_offset, unsigned int next_offset); +int xt_check_table_hooks(const struct xt_table_info *info, unsigned int valid_hooks); + unsigned int *xt_alloc_entry_offsets(unsigned int size); bool xt_find_jump_offset(const unsigned int *offsets, unsigned int target, unsigned int size); @@ -301,6 +303,7 @@ int xt_data_to_user(void __user *dst, const void *src, void *xt_copy_counters_from_user(const void __user *user, unsigned int len, struct xt_counters_info *info, bool compat); +struct xt_counters *xt_counters_alloc(unsigned int counters); struct xt_table *xt_register_table(struct net *net, const struct xt_table *table, @@ -509,7 +512,7 @@ void xt_compat_unlock(u_int8_t af); int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta); void xt_compat_flush_offsets(u_int8_t af); -void xt_compat_init_offsets(u_int8_t af, unsigned int number); +int xt_compat_init_offsets(u8 af, unsigned int number); int xt_compat_calc_jump(u_int8_t af, unsigned int offset); int xt_compat_match_offset(const struct xt_match *match); diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h index adf8db44cf86..e61184fbfb71 100644 --- a/include/net/netfilter/nf_conntrack_count.h +++ b/include/net/netfilter/nf_conntrack_count.h @@ -11,7 +11,6 @@ void nf_conncount_destroy(struct net *net, unsigned int family, unsigned int nf_conncount_count(struct net *net, struct nf_conncount_data *data, const u32 *key, - unsigned int family, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone); #endif diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h index fc39bbaf107c..32c2a94a219d 100644 --- a/include/net/netfilter/nf_conntrack_helper.h +++ b/include/net/netfilter/nf_conntrack_helper.h @@ -132,8 +132,7 @@ void nf_conntrack_helper_pernet_fini(struct net *net); int nf_conntrack_helper_init(void); void nf_conntrack_helper_fini(void); -int nf_conntrack_broadcast_help(struct sk_buff *skb, unsigned int protoff, - struct nf_conn *ct, +int nf_conntrack_broadcast_help(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int timeout); diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 663b015dace5..bd2a18d66189 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -434,11 +434,11 @@ static inline struct nft_set *nft_set_container_of(const void *priv) return (void *)priv - offsetof(struct nft_set, data); } -struct nft_set *nft_set_lookup(const struct net *net, - const struct nft_table *table, - const struct nlattr *nla_set_name, - const struct nlattr *nla_set_id, - u8 genmask); +struct nft_set *nft_set_lookup_global(const struct net *net, + const struct nft_table *table, + const struct nlattr *nla_set_name, + const struct nlattr *nla_set_id, + u8 genmask); static inline unsigned long nft_set_gc_interval(const struct nft_set *set) { @@ -868,7 +868,7 @@ struct nft_chain { char *name; }; -enum nft_chain_type { +enum nft_chain_types { NFT_CHAIN_T_DEFAULT = 0, NFT_CHAIN_T_ROUTE, NFT_CHAIN_T_NAT, @@ -876,7 +876,7 @@ enum nft_chain_type { }; /** - * struct nf_chain_type - nf_tables chain type info + * struct nft_chain_type - nf_tables chain type info * * @name: name of the type * @type: numeric identifier @@ -884,18 +884,22 @@ enum nft_chain_type { * @owner: module owner * @hook_mask: mask of valid hooks * @hooks: array of hook functions + * @init: chain initialization function + * @free: chain release function */ -struct nf_chain_type { +struct nft_chain_type { const char *name; - enum nft_chain_type type; + enum nft_chain_types type; int family; struct module *owner; unsigned int hook_mask; nf_hookfn *hooks[NF_MAX_HOOKS]; + int (*init)(struct nft_ctx *ctx); + void (*free)(struct nft_ctx *ctx); }; int nft_chain_validate_dependency(const struct nft_chain *chain, - enum nft_chain_type type); + enum nft_chain_types type); int nft_chain_validate_hooks(const struct nft_chain *chain, unsigned int hook_flags); @@ -917,7 +921,7 @@ struct nft_stats { */ struct nft_base_chain { struct nf_hook_ops ops; - const struct nf_chain_type *type; + const struct nft_chain_type *type; u8 policy; u8 flags; struct nft_stats __percpu *stats; @@ -970,8 +974,8 @@ struct nft_table { char *name; }; -int nft_register_chain_type(const struct nf_chain_type *); -void nft_unregister_chain_type(const struct nf_chain_type *); +void nft_register_chain_type(const struct nft_chain_type *); +void nft_unregister_chain_type(const struct nft_chain_type *); int nft_register_expr(struct nft_expr_type *); void nft_unregister_expr(struct nft_expr_type *); @@ -1345,4 +1349,7 @@ struct nft_trans_flowtable { #define nft_trans_flowtable(trans) \ (((struct nft_trans_flowtable *)trans->data)->flowtable) +int __init nft_chain_filter_init(void); +void __exit nft_chain_filter_fini(void); + #endif /* _NET_NF_TABLES_H */ diff --git a/include/net/netfilter/xt_rateest.h b/include/net/netfilter/xt_rateest.h index b1db13772554..832ab69efda5 100644 --- a/include/net/netfilter/xt_rateest.h +++ b/include/net/netfilter/xt_rateest.h @@ -21,7 +21,7 @@ struct xt_rateest { struct net_rate_estimator __rcu *rate_est; }; -struct xt_rateest *xt_rateest_lookup(const char *name); -void xt_rateest_put(struct xt_rateest *est); +struct xt_rateest *xt_rateest_lookup(struct net *net, const char *name); +void xt_rateest_put(struct net *net, struct xt_rateest *est); #endif /* _XT_RATEEST_H */ diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h index 9574bd40870b..c712eb6879f1 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_common.h +++ b/include/uapi/linux/netfilter/nf_conntrack_common.h @@ -129,6 +129,7 @@ enum ip_conntrack_events { IPCT_NATSEQADJ = IPCT_SEQADJ, IPCT_SECMARK, /* new security mark has been set */ IPCT_LABEL, /* new connlabel has been set */ + IPCT_SYNPROXY, /* synproxy has been set */ #ifdef __KERNEL__ __IPCT_MAX #endif diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 66dceee0ae30..6a3d653d5b27 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -909,8 +909,8 @@ enum nft_rt_attributes { * @NFT_CT_EXPIRATION: relative conntrack expiration time in ms * @NFT_CT_HELPER: connection tracking helper assigned to conntrack * @NFT_CT_L3PROTOCOL: conntrack layer 3 protocol - * @NFT_CT_SRC: conntrack layer 3 protocol source (IPv4/IPv6 address) - * @NFT_CT_DST: conntrack layer 3 protocol destination (IPv4/IPv6 address) + * @NFT_CT_SRC: conntrack layer 3 protocol source (IPv4/IPv6 address, deprecated) + * @NFT_CT_DST: conntrack layer 3 protocol destination (IPv4/IPv6 address, deprecated) * @NFT_CT_PROTOCOL: conntrack layer 4 protocol * @NFT_CT_PROTO_SRC: conntrack layer 4 protocol source * @NFT_CT_PROTO_DST: conntrack layer 4 protocol destination @@ -920,6 +920,10 @@ enum nft_rt_attributes { * @NFT_CT_AVGPKT: conntrack average bytes per packet * @NFT_CT_ZONE: conntrack zone * @NFT_CT_EVENTMASK: ctnetlink events to be generated for this conntrack + * @NFT_CT_SRC_IP: conntrack layer 3 protocol source (IPv4 address) + * @NFT_CT_DST_IP: conntrack layer 3 protocol destination (IPv4 address) + * @NFT_CT_SRC_IP6: conntrack layer 3 protocol source (IPv6 address) + * @NFT_CT_DST_IP6: conntrack layer 3 protocol destination (IPv6 address) */ enum nft_ct_keys { NFT_CT_STATE, @@ -941,6 +945,10 @@ enum nft_ct_keys { NFT_CT_AVGPKT, NFT_CT_ZONE, NFT_CT_EVENTMASK, + NFT_CT_SRC_IP, + NFT_CT_DST_IP, + NFT_CT_SRC_IP6, + NFT_CT_DST_IP6, }; /** diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h index 7397e022ce6e..77987111cab0 100644 --- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h +++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h @@ -54,6 +54,7 @@ enum ctattr_type { CTA_MARK_MASK, CTA_LABELS, CTA_LABELS_MASK, + CTA_SYNPROXY, __CTA_MAX }; #define CTA_MAX (__CTA_MAX - 1) @@ -190,6 +191,15 @@ enum ctattr_natseq { }; #define CTA_NAT_SEQ_MAX (__CTA_NAT_SEQ_MAX - 1) +enum ctattr_synproxy { + CTA_SYNPROXY_UNSPEC, + CTA_SYNPROXY_ISN, + CTA_SYNPROXY_ITS, + CTA_SYNPROXY_TSOFF, + __CTA_SYNPROXY_MAX, +}; +#define CTA_SYNPROXY_MAX (__CTA_SYNPROXY_MAX - 1) + enum ctattr_expe |