From 9e4e948a3edafd2b7f4dc14c395e146ffd0d9611 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Wed, 9 Oct 2013 09:24:27 +0300 Subject: ipvs: avoid rcu_barrier during netns cleanup commit 578bc3ef1e473a ("ipvs: reorganize dest trash") added rcu_barrier() on cleanup to wait dest users and schedulers like LBLC and LBLCR to put their last dest reference. Using rcu_barrier with many namespaces is problematic. Trying to fix it by freeing dest with kfree_rcu is not a solution, RCU callbacks can run in parallel and execution order is random. Fix it by creating new function ip_vs_dest_put_and_free() which is heavier than ip_vs_dest_put(). We will use it just for schedulers like LBLC, LBLCR that can delay their dest release. By default, dests reference is above 0 if they are present in service and it is 0 when deleted but still in trash list. Change the dest trash code to use ip_vs_dest_put_and_free(), so that refcnt -1 can be used for freeing. As result, such checks remain in slow path and the rcu_barrier() from netns cleanup can be removed. Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_ctl.c | 6 +----- net/netfilter/ipvs/ip_vs_lblc.c | 2 +- net/netfilter/ipvs/ip_vs_lblcr.c | 2 +- 3 files changed, 3 insertions(+), 7 deletions(-) (limited to 'net/netfilter/ipvs') diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index a3df9bddc4f7..62786a495cea 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -704,7 +704,7 @@ static void ip_vs_dest_free(struct ip_vs_dest *dest) __ip_vs_dst_cache_reset(dest); __ip_vs_svc_put(svc, false); free_percpu(dest->stats.cpustats); - kfree(dest); + ip_vs_dest_put_and_free(dest); } /* @@ -3820,10 +3820,6 @@ void __net_exit ip_vs_control_net_cleanup(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); - /* Some dest can be in grace period even before cleanup, we have to - * defer ip_vs_trash_cleanup until ip_vs_dest_wait_readers is called. - */ - rcu_barrier(); ip_vs_trash_cleanup(net); ip_vs_stop_estimator(net, &ipvs->tot_stats); ip_vs_control_net_cleanup_sysctl(net); diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index eff13c94498e..ca056a331e60 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -136,7 +136,7 @@ static void ip_vs_lblc_rcu_free(struct rcu_head *head) struct ip_vs_lblc_entry, rcu_head); - ip_vs_dest_put(en->dest); + ip_vs_dest_put_and_free(en->dest); kfree(en); } diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index 0b8550089a2e..3f21a2f47de1 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -130,7 +130,7 @@ static void ip_vs_lblcr_elem_rcu_free(struct rcu_head *head) struct ip_vs_dest_set_elem *e; e = container_of(head, struct ip_vs_dest_set_elem, rcu_head); - ip_vs_dest_put(e->dest); + ip_vs_dest_put_and_free(e->dest); kfree(e); } -- cgit v1.2.3 From 1255ce5f10dbb4646c8d43b8d59faab48ae4a6b2 Mon Sep 17 00:00:00 2001 From: Alexander Frolkin Date: Fri, 27 Sep 2013 11:06:23 +0100 Subject: ipvs: improved SH fallback strategy Improve the SH fallback realserver selection strategy. With sh and sh-fallback, if a realserver is down, this attempts to distribute the traffic that would have gone to that server evenly among the remaining servers. Signed-off-by: Alexander Frolkin Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_sh.c | 39 +++++++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 10 deletions(-) (limited to 'net/netfilter/ipvs') diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c index 3588faebe529..cc65b2f42cd4 100644 --- a/net/netfilter/ipvs/ip_vs_sh.c +++ b/net/netfilter/ipvs/ip_vs_sh.c @@ -115,27 +115,46 @@ ip_vs_sh_get(struct ip_vs_service *svc, struct ip_vs_sh_state *s, } -/* As ip_vs_sh_get, but with fallback if selected server is unavailable */ +/* As ip_vs_sh_get, but with fallback if selected server is unavailable + * + * The fallback strategy loops around the table starting from a "random" + * point (in fact, it is chosen to be the original hash value to make the + * algorithm deterministic) to find a new server. + */ static inline struct ip_vs_dest * ip_vs_sh_get_fallback(struct ip_vs_service *svc, struct ip_vs_sh_state *s, const union nf_inet_addr *addr, __be16 port) { - unsigned int offset; - unsigned int hash; + unsigned int offset, roffset; + unsigned int hash, ihash; struct ip_vs_dest *dest; + /* first try the dest it's supposed to go to */ + ihash = ip_vs_sh_hashkey(svc->af, addr, port, 0); + dest = rcu_dereference(s->buckets[ihash].dest); + if (!dest) + return NULL; + if (!is_unavailable(dest)) + return dest; + + IP_VS_DBG_BUF(6, "SH: selected unavailable server %s:%d, reselecting", + IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port)); + + /* if the original dest is unavailable, loop around the table + * starting from ihash to find a new dest + */ for (offset = 0; offset < IP_VS_SH_TAB_SIZE; offset++) { - hash = ip_vs_sh_hashkey(svc->af, addr, port, offset); + roffset = (offset + ihash) % IP_VS_SH_TAB_SIZE; + hash = ip_vs_sh_hashkey(svc->af, addr, port, roffset); dest = rcu_dereference(s->buckets[hash].dest); if (!dest) break; - if (is_unavailable(dest)) - IP_VS_DBG_BUF(6, "SH: selected unavailable server " - "%s:%d (offset %d)", - IP_VS_DBG_ADDR(svc->af, &dest->addr), - ntohs(dest->port), offset); - else + if (!is_unavailable(dest)) return dest; + IP_VS_DBG_BUF(6, "SH: selected unavailable " + "server %s:%d (offset %d), reselecting", + IP_VS_DBG_ADDR(svc->af, &dest->addr), + ntohs(dest->port), roffset); } return NULL; -- cgit v1.2.3 From 6e7cd27c0f77847f1b07a81ae2ed17b937a7531a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 25 Oct 2013 11:05:04 +0200 Subject: net: ipvs: sctp: add missing verdict assignments in sctp_conn_schedule If skb_header_pointer() fails, we need to assign a verdict, that is NF_DROP in this case, otherwise, we would leave the verdict from conn_schedule() uninitialized when returning. Signed-off-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Acked-by: Neil Horman Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_proto_sctp.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net/netfilter/ipvs') diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index 23e596e438b3..9ca7aa033284 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -20,13 +20,18 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, sctp_sctphdr_t *sh, _sctph; sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph); - if (sh == NULL) + if (sh == NULL) { + *verdict = NF_DROP; return 0; + } sch = skb_header_pointer(skb, iph->len + sizeof(sctp_sctphdr_t), sizeof(_schunkh), &_schunkh); - if (sch == NULL) + if (sch == NULL) { + *verdict = NF_DROP; return 0; + } + net = skb_net(skb); ipvs = net_ipvs(net); rcu_read_lock(); -- cgit v1.2.3 From 97203abe6bc41ee020f37c902bd1a761157f22c1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 28 Oct 2013 10:56:20 +0100 Subject: net: ipvs: sctp: do not recalc sctp csum when ports didn't change Unlike UDP or TCP, we do not take the pseudo-header into account in SCTP checksums. So in case port mapping is the very same, we do not need to recalculate the whole SCTP checksum in software, which is very expensive. Also, similarly as in TCP, take into account when a private helper mangled the packet. In that case, we also need to recalculate the checksum even if ports might be same. Thanks for feedback regarding skb->ip_summed checks from Julian Anastasov; here's a discussion on these checks for snat and dnat: * For snat_handler(), we can see CHECKSUM_PARTIAL from virtual devices, and from LOCAL_OUT, otherwise it should be CHECKSUM_UNNECESSARY. In general, in snat it is more complex. skb contains the original route and ip_vs_route_me_harder() can change the route after snat_handler. So, for locally generated replies from local server we can not preserve the CHECKSUM_PARTIAL mode. It is an chicken or egg dilemma: snat_handler needs the device after rerouting (to check for NETIF_F_SCTP_CSUM), while ip_route_me_harder() wants the snat_handler() to put the new saddr for proper rerouting. * For dnat_handler(), we should not see CHECKSUM_COMPLETE for SCTP, in fact the small set of drivers that support SCTP offloading return CHECKSUM_UNNECESSARY on correctly received SCTP csum. We can see CHECKSUM_PARTIAL from local stack or received from virtual drivers. The idea is that SCTP decides to avoid csum calculation if hardware supports offloading. IPVS can change the device after rerouting to real server but we can preserve the CHECKSUM_PARTIAL mode if the new device supports offloading too. This works because skb dst is changed before dnat_handler and we see the new device. So, checks in the 'if' part will decide whether it is ok to keep CHECKSUM_PARTIAL for the output. If the packet was with CHECKSUM_NONE, hence we deal with unknown checksum. As we recalculate the sum for IP header in all cases, it should be safe to use CHECKSUM_UNNECESSARY. We can forward wrong checksum in this case (without cp->app). In case of CHECKSUM_UNNECESSARY, the csum was valid on receive. Signed-off-by: Daniel Borkmann Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_proto_sctp.c | 39 +++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) (limited to 'net/netfilter/ipvs') diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index 9ca7aa033284..2f7ea7564044 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -81,6 +81,7 @@ sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, { sctp_sctphdr_t *sctph; unsigned int sctphoff = iph->len; + bool payload_csum = false; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6 && iph->fragoffs) @@ -92,19 +93,31 @@ sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, return 0; if (unlikely(cp->app != NULL)) { + int ret; + /* Some checks before mangling */ if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) return 0; /* Call application helper if needed */ - if (!ip_vs_app_pkt_out(cp, skb)) + ret = ip_vs_app_pkt_out(cp, skb); + if (ret == 0) return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 2) + payload_csum = true; } sctph = (void *) skb_network_header(skb) + sctphoff; - sctph->source = cp->vport; - sctp_nat_csum(skb, sctph, sctphoff); + /* Only update csum if we really have to */ + if (sctph->source != cp->vport || payload_csum || + skb->ip_summed == CHECKSUM_PARTIAL) { + sctph->source = cp->vport; + sctp_nat_csum(skb, sctph, sctphoff); + } else { + skb->ip_summed = CHECKSUM_UNNECESSARY; + } return 1; } @@ -115,6 +128,7 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, { sctp_sctphdr_t *sctph; unsigned int sctphoff = iph->len; + bool payload_csum = false; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6 && iph->fragoffs) @@ -126,19 +140,32 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, return 0; if (unlikely(cp->app != NULL)) { + int ret; + /* Some checks before mangling */ if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) return 0; /* Call application helper if needed */ - if (!ip_vs_app_pkt_in(cp, skb)) + ret = ip_vs_app_pkt_in(cp, skb); + if (ret == 0) return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 2) + payload_csum = true; } sctph = (void *) skb_network_header(skb) + sctphoff; - sctph->dest = cp->dport; - sctp_nat_csum(skb, sctph, sctphoff); + /* Only update csum if we really have to */ + if (sctph->dest != cp->dport || payload_csum || + (skb->ip_summed == CHECKSUM_PARTIAL && + !(skb_dst(skb)->dev->features & NETIF_F_SCTP_CSUM))) { + sctph->dest = cp->dport; + sctp_nat_csum(skb, sctph, sctphoff); + } else if (skb->ip_summed != CHECKSUM_PARTIAL) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + } return 1; } -- cgit v1.2.3