Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[BPF] rework L2 overlay (vxlan) for performance #9965

Draft
wants to merge 12 commits into
base: master
Choose a base branch
from
25 changes: 16 additions & 9 deletions felix/bpf-gpl/bpf.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,11 @@
// relative to the workload so the ingress program is applied at egress from the host namespace
// and vice-versa.
#define CALI_TC_INGRESS (1<<1)
// CALI_TC_TUNNEL is set when compiling the program for the IPIP tunnel. It is *not* set
// CALI_TC_IPIP is set when compiling the program for the IPIP tunnel. It is *not* set
// when compiling the wireguard or any other tunnel program (e.g. VXLAN). IPIP is a special case because
// it is a layer 3 device, so we don't see an ethernet header on packets arriving from the IPIP
// device.
#define CALI_TC_TUNNEL (1<<2)
#define CALI_TC_IPIP (1<<2)
// CALI_CGROUP is set when compiling the cgroup connect-time load balancer programs.
#define CALI_CGROUP (1<<3)
// CALI_TC_DSR is set when compiling programs for DSR mode. In DSR mode, traffic to node
Expand All @@ -53,6 +53,7 @@
#define CALI_TC_NAT_IF (1<<7)
#define CALI_TC_LO (1<<8)
#define CALI_CT_CLEANUP (1<<9)
#define CALI_TC_VXLAN (1<<10)

#ifndef CALI_DROP_WORKLOAD_TO_HOST
#define CALI_DROP_WORKLOAD_TO_HOST false
Expand All @@ -67,13 +68,13 @@

#define CALI_F_HEP ((CALI_COMPILE_FLAGS) & (CALI_TC_HOST_EP | CALI_TC_NAT_IF))
#define CALI_F_WEP (!CALI_F_HEP)
#define CALI_F_TUNNEL (((CALI_COMPILE_FLAGS) & CALI_TC_TUNNEL) != 0)
#define CALI_F_IPIP (((CALI_COMPILE_FLAGS) & CALI_TC_IPIP) != 0)
#define CALI_F_L3_DEV (((CALI_COMPILE_FLAGS) & CALI_TC_L3_DEV) != 0)
#define CALI_F_NAT_IF (((CALI_COMPILE_FLAGS) & CALI_TC_NAT_IF) != 0)
#define CALI_F_LO (((CALI_COMPILE_FLAGS) & CALI_TC_LO) != 0)
#define CALI_F_CT_CLEANUP (((CALI_COMPILE_FLAGS) & CALI_CT_CLEANUP) != 0)

#define CALI_F_MAIN (CALI_F_HEP && !CALI_F_TUNNEL && !CALI_F_L3_DEV && !CALI_F_NAT_IF && !CALI_F_LO)
#define CALI_F_MAIN (CALI_F_HEP && !CALI_F_IPIP && !CALI_F_L3_DEV && !CALI_F_NAT_IF && !CALI_F_LO)

#define CALI_F_XDP ((CALI_COMPILE_FLAGS) & CALI_XDP_PROG)

Expand All @@ -85,10 +86,15 @@

#define CALI_F_TO_HOST ((CALI_F_FROM_HEP || CALI_F_FROM_WEP) != 0)
#define CALI_F_FROM_HOST (!CALI_F_TO_HOST)
#define CALI_F_L3 ((CALI_F_TO_HEP && CALI_F_TUNNEL) || CALI_F_L3_DEV)
#define CALI_F_IPIP_ENCAPPED ((CALI_F_INGRESS && CALI_F_TUNNEL))
#define CALI_F_L3 ((CALI_F_TO_HEP && CALI_F_IPIP) || CALI_F_L3_DEV)
#define CALI_F_IPIP_ENCAPPED ((CALI_F_INGRESS && CALI_F_IPIP))
#define CALI_F_L3_INGRESS (CALI_F_INGRESS && CALI_F_L3_DEV)

#define CALI_F_WIREGUARD CALI_F_L3_DEV
#define CALI_F_VXLAN (((CALI_COMPILE_FLAGS) & CALI_TC_VXLAN) != 0)

#define CALI_F_TUNNEL (CALI_F_IPIP || CALI_F_WIREGUARD || CALI_F_VXLAN)

#define CALI_F_CGROUP (((CALI_COMPILE_FLAGS) & CALI_CGROUP) != 0)
#define CALI_F_DSR ((CALI_COMPILE_FLAGS & CALI_TC_DSR) != 0)

Expand All @@ -115,7 +121,7 @@ static CALI_BPF_INLINE void __compile_asserts(void) {
CALI_COMPILE_FLAGS == 0 ||
CALI_F_CT_CLEANUP ||
!!(CALI_COMPILE_FLAGS & CALI_CGROUP) !=
!!(CALI_COMPILE_FLAGS & (CALI_TC_HOST_EP | CALI_TC_INGRESS | CALI_TC_TUNNEL | CALI_TC_DSR | CALI_XDP_PROG))
!!(CALI_COMPILE_FLAGS & (CALI_TC_HOST_EP | CALI_TC_INGRESS | CALI_TC_IPIP | CALI_TC_DSR | CALI_XDP_PROG))
);
COMPILE_TIME_ASSERT(!CALI_F_DSR || (CALI_F_DSR && CALI_F_FROM_WEP) || (CALI_F_DSR && CALI_F_HEP));
COMPILE_TIME_ASSERT(CALI_F_TO_HOST || CALI_F_FROM_HOST);
Expand Down Expand Up @@ -266,9 +272,9 @@ static CALI_BPF_INLINE void ip_dec_ttl(struct iphdr *ip)
}

#ifdef IPVER6
#define ip_ttl_exceeded(ip) (CALI_F_TO_HOST && !CALI_F_TUNNEL && (ip)->hop_limit <= 1)
#define ip_ttl_exceeded(ip) (CALI_F_TO_HOST && !CALI_F_IPIP && (ip)->hop_limit <= 1)
#else
#define ip_ttl_exceeded(ip) (CALI_F_TO_HOST && !CALI_F_TUNNEL && (ip)->ttl <= 1)
#define ip_ttl_exceeded(ip) (CALI_F_TO_HOST && !CALI_F_IPIP && (ip)->ttl <= 1)
#endif

#if CALI_F_XDP
Expand Down Expand Up @@ -310,6 +316,7 @@ extern const volatile struct cali_tc_preamble_globals __globals;
#define WG_PORT CALI_CONFIGURABLE(wg_port)
#define NATIN_IFACE CALI_CONFIGURABLE(natin_idx)
#define PROFILING CALI_CONFIGURABLE(profiling)
#define OVERLAY_TUNNEL_ID CALI_CONFIGURABLE(overlay_tunnel_id)

#define FLOWLOGS_ENABLED (GLOBAL_FLAGS & CALI_GLOBALS_FLOWLOGS_ENABLED)

Expand Down
15 changes: 10 additions & 5 deletions felix/bpf-gpl/calculate-flags
Original file line number Diff line number Diff line change
Expand Up @@ -45,27 +45,32 @@ flags=0
# WARNING: these constants must be kept in sync with bpf.h.
((CALI_TC_HOST_EP = 1 << 0))
((CALI_TC_INGRESS = 1 << 1))
((CALI_TC_TUNNEL = 1 << 2))
((CALI_TC_IPIP = 1 << 2))
((CALI_CGROUP = 1 << 3))
((CALI_TC_DSR = 1 << 4))
((CALI_TC_L3_DEV = 1 << 5))
((CALI_XDP_PROG = 1 << 6))
((CALI_TC_NAT_IF = 1 << 7))
((CALI_TC_LO = 1 << 8))
((CALI_CT_CLEANUP = 1 << 9))
((CALI_TC_VXLAN = 1 << 10))

if [[ "${filename}" =~ .*hep.* ]]; then
# Host endpoint.
((flags |= CALI_TC_HOST_EP))
ep_type="host"
elif [[ "${filename}" =~ .*tnl.* ]]; then
# Tunnel.
((flags |= CALI_TC_TUNNEL | CALI_TC_HOST_EP))
ep_type="tunnel"
elif [[ "${filename}" =~ .*ipip.* ]]; then
# IPIP.
((flags |= CALI_TC_IPIP | CALI_TC_HOST_EP))
ep_type="ipip"
elif [[ "${filename}" =~ .*l3.* ]]; then
# Any l3 device.
((flags |= CALI_TC_L3_DEV | CALI_TC_HOST_EP))
ep_type="l3dev"
elif [[ "${filename}" =~ .*vxlan.* ]]; then
# Any vxlan device.
((flags |= CALI_TC_VXLAN | CALI_TC_HOST_EP))
ep_type="vxlan"
elif [[ "${filename}" =~ .*connect.* ]]; then
# Connect-time load balancer (CGROUP attached).
((flags |= CALI_CGROUP))
Expand Down
2 changes: 1 addition & 1 deletion felix/bpf-gpl/conntrack.h
Original file line number Diff line number Diff line change
Expand Up @@ -1050,7 +1050,7 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_lookup(struct cali_tc_c
}
}

if ((CALI_F_INGRESS && CALI_F_TUNNEL) || !skb_seen(ctx->skb)) {
if ((CALI_F_INGRESS && CALI_F_IPIP) || !skb_seen(ctx->skb)) {
/* Account for the src->dst leg if we haven't seen the packet yet.
* Since when the traffic is tunneled, BPF program on the host
* iface sees it first and marks it as seen before another
Expand Down
122 changes: 117 additions & 5 deletions felix/bpf-gpl/fib_co_re.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,25 @@

#include "profiling.h"

#include <linux/if_packet.h>

/* redirect_to_peer attempts to hand the packet over with bpf_redirect_peer()
 * to the interface recorded in the conntrack result (ct_result.ifindex_fwd).
 *
 * The redirect is only attempted when all of the following hold:
 *   - the CALI_GLOBALS_REDIRECT_PEER bit is set in GLOBAL_FLAGS,
 *   - the conntrack result code is CALI_CT_ESTABLISHED_BYPASS,
 *   - ct_result.ifindex_fwd is not CT_INVALID_IFINDEX.
 *
 * Returns TC_ACT_REDIRECT when bpf_redirect_peer() returned TC_ACT_REDIRECT,
 * and TC_ACT_UNSPEC in every other case so that the caller can carry on with
 * its remaining forwarding options.
 */
static CALI_BPF_INLINE int redirect_to_peer(struct cali_tc_ctx *ctx)
{
	struct cali_tc_state *st = ctx->state;

	/* Feature switch: peer redirect must be enabled globally. */
	if (!(GLOBAL_FLAGS & CALI_GLOBALS_REDIRECT_PEER)) {
		return TC_ACT_UNSPEC;
	}

	/* Only flows that conntrack reports as established-bypass qualify. */
	if (ct_result_rc(st->ct_result.rc) != CALI_CT_ESTABLISHED_BYPASS) {
		return TC_ACT_UNSPEC;
	}

	/* Without a recorded forward ifindex there is nothing to redirect to. */
	if (st->ct_result.ifindex_fwd == CT_INVALID_IFINDEX) {
		return TC_ACT_UNSPEC;
	}

	if (bpf_redirect_peer(st->ct_result.ifindex_fwd, 0) != TC_ACT_REDIRECT) {
		return TC_ACT_UNSPEC;
	}

	CALI_DEBUG("Redirect to peer interface (%d) succeeded.", st->ct_result.ifindex_fwd);
	return TC_ACT_REDIRECT;
}

static CALI_BPF_INLINE int forward_or_drop(struct cali_tc_ctx *ctx)
{
int rc = ctx->fwd.res;
Expand Down Expand Up @@ -86,18 +105,108 @@ static CALI_BPF_INLINE int forward_or_drop(struct cali_tc_ctx *ctx)
/* fall through to FIB if enabled or the IP stack, don't give up yet. */
rc = TC_ACT_UNSPEC;
} else if (CALI_F_FROM_HEP && bpf_core_enum_value_exists(enum bpf_func_id, BPF_FUNC_redirect_peer)) {
bool redirect_peer = GLOBAL_FLAGS & CALI_GLOBALS_REDIRECT_PEER;
if ((rc = redirect_to_peer(ctx)) == TC_ACT_REDIRECT) {
goto skip_fib;
}
} else if (CALI_F_FROM_WEP && fwd_fib(&ctx->fwd)) {
struct cali_rt *dest_rt = cali_rt_lookup(&ctx->state->ip_dst);
if (dest_rt == NULL) {
CALI_DEBUG("No route for " IP_FMT " to forward from WEP", &ctx->state->ip_dst);
goto try_fib_external;
}

if (redirect_peer && ct_result_rc(state->ct_result.rc) == CALI_CT_ESTABLISHED_BYPASS &&
state->ct_result.ifindex_fwd != CT_INVALID_IFINDEX) {
rc = bpf_redirect_peer(state->ct_result.ifindex_fwd, 0);
if (cali_rt_flags_local_host(dest_rt->flags)) {
goto skip_fib;
}

if (state->ct_result.ifindex_fwd == CT_INVALID_IFINDEX) {
*fib_params(ctx) = (struct bpf_fib_lookup) {
#ifdef IPVER6
.family = 10, /* AF_INET6 */
#else
.family = 2, /* AF_INET */
#endif
.tot_len = 0,
.ifindex = CALI_F_TO_HOST ? ctx->skb->ingress_ifindex : ctx->skb->ifindex,
.l4_protocol = state->ip_proto,
};
#ifdef IPVER6
ipv6_addr_t_to_be32_4_ip(fib_params(ctx)->ipv6_src, &state->ip_src);
ipv6_addr_t_to_be32_4_ip(fib_params(ctx)->ipv6_dst, &state->ip_dst);
#else
fib_params(ctx)->ipv4_src = state->ip_src;
fib_params(ctx)->ipv4_dst = state->ip_dst;
#endif

rc = bpf_fib_lookup(ctx->skb, fib_params(ctx), sizeof(struct bpf_fib_lookup), 0);
state->ct_result.ifindex_fwd = fib_params(ctx)->ifindex;
}

if (cali_rt_flags_local_workload(dest_rt->flags)) {
if ((rc = redirect_to_peer(ctx)) == TC_ACT_REDIRECT) {
goto skip_fib;
}
} else if (cali_rt_is_vxlan(dest_rt)) {
struct bpf_tunnel_key key = {
.tunnel_id = OVERLAY_TUNNEL_ID,
};
__u64 flags = 0;
__u32 size = 0;
#ifdef IPVER6
ipv6_addr_t_to_be32_4_ip(key.remote_ipv6, &dest_rt->next_hop);
flags |= BPF_F_TUNINFO_IPV6;
size = offsetof(struct bpf_tunnel_key, local_ipv6);
#else
key.remote_ipv4 = bpf_htonl(dest_rt->next_hop);
flags |= BPF_F_ZERO_CSUM_TX;
size = offsetof(struct bpf_tunnel_key, local_ipv4);
#endif

int err = bpf_skb_set_tunnel_key(ctx->skb, &key, size, flags);
CALI_DEBUG("bpf_skb_set_tunnel_key %d nh " IP_FMT, err, &dest_rt->next_hop);

rc = bpf_redirect(state->ct_result.ifindex_fwd, 0);
if (rc == TC_ACT_REDIRECT) {
CALI_DEBUG("Redirect to peer interface (%d) succeeded.", state->ct_result.ifindex_fwd);
CALI_DEBUG("Redirect to dev %d without fib lookup",
state->ct_result.ifindex_fwd);
goto skip_fib;
}
}
} else if (CALI_F_VXLAN && CALI_F_TO_HEP) {
if (!(ctx->skb->mark & CALI_SKB_MARK_SEEN)) {
/* packet to vxlan from the host, needs to set tunnel key */
struct cali_rt *dest_rt = cali_rt_lookup(&ctx->state->ip_dst);
if (dest_rt == NULL) {
CALI_DEBUG("No route for " IP_FMT " at vxlan device", &ctx->state->ip_dst);
goto deny;
}
if (!cali_rt_is_vxlan(dest_rt)) {
CALI_DEBUG("Not a vxlan route for " IP_FMT " at vxlan device", &ctx->state->ip_dst);
goto deny;
}

struct bpf_tunnel_key key = {
.tunnel_id = OVERLAY_TUNNEL_ID,
};

__u64 flags = 0;
__u32 size = 0;
#ifdef IPVER6
ipv6_addr_t_to_be32_4_ip(key.remote_ipv6, &dest_rt->next_hop);
flags |= BPF_F_TUNINFO_IPV6;
size = offsetof(struct bpf_tunnel_key, local_ipv6);
#else
key.remote_ipv4 = bpf_htonl(dest_rt->next_hop);
flags |= BPF_F_ZERO_CSUM_TX;
size = offsetof(struct bpf_tunnel_key, local_ipv4);
#endif

int err = bpf_skb_set_tunnel_key(ctx->skb, &key, size, flags);
CALI_DEBUG("bpf_skb_set_tunnel_key %d nh " IP_FMT, err, &dest_rt->next_hop);
}
}

try_fib_external:
#if CALI_FIB_ENABLED
/* Only do FIB for packets to be turned around at a HEP on HEP egress. */
if (CALI_F_TO_HEP && !(ctx->state->flags & CALI_ST_CT_NP_LOOP)) {
Expand Down Expand Up @@ -363,6 +472,9 @@ static CALI_BPF_INLINE int forward_or_drop(struct cali_tc_ctx *ctx)
CALI_INFO("Final result=DENY (%d). Program execution time: %lluns",
reason, prog_end_time-state->prog_start_time);
} else {
if (CALI_F_VXLAN && CALI_F_TO_HOST) {
bpf_skb_change_type(ctx->skb, PACKET_HOST);
}
CALI_INFO("Final result=ALLOW rc %d. Program execution time: %lluns",
rc, prog_end_time-state->prog_start_time);
}
Expand Down
1 change: 1 addition & 0 deletions felix/bpf-gpl/globals.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ struct name { \
__be16 profiling; \
__u32 natin_idx; \
__u32 natout_idx; \
__u32 overlay_tunnel_id; \
__u8 iface_name[16]; \
__u32 log_filter_jmp; \
__u32 jumps[40]; \
Expand Down
2 changes: 1 addition & 1 deletion felix/bpf-gpl/list-objs
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ for log_level in debug no_log; do
# The workload-to-host drop setting only applies to the from-workload hook.
ep_types="wep"
else
ep_types="wep hep tnl l3 nat lo"
ep_types="wep hep ipip l3 nat lo vxlan"
fi
for fib in "" "fib_"; do
for ep_type in $ep_types; do
Expand Down
7 changes: 1 addition & 6 deletions felix/bpf-gpl/nat.h
Original file line number Diff line number Diff line change
Expand Up @@ -159,12 +159,7 @@ static CALI_BPF_INLINE int vxlan_attempt_decap(struct cali_tc_ctx *ctx)
{
/* decap on host ep only if directly for the node */
CALI_DEBUG("VXLAN tunnel packet to " IP_FMT " (host IP=" IP_FMT ")",
#ifdef IPVER6
bpf_ntohl(ip_hdr(ctx)->daddr.in6_u.u6_addr32[3]),
#else
bpf_ntohl(ip_hdr(ctx)->daddr),
#endif
debug_ip(HOST_IP));
&ip_hdr(ctx)->daddr, debug_ip(HOST_IP));

if (!rt_addr_is_local_host((ipv46_addr_t *)&ip_hdr(ctx)->daddr)) {
goto fall_through;
Expand Down
4 changes: 2 additions & 2 deletions felix/bpf-gpl/parsing.h
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ static CALI_BPF_INLINE int tc_state_fill_from_nexthdr(struct cali_tc_ctx *ctx, b
break;
#endif
case IPPROTO_IPIP:
if (CALI_F_TUNNEL | CALI_F_L3_DEV) {
if (CALI_F_IPIP | CALI_F_L3_DEV) {
// IPIP should never be sent down the tunnel.
CALI_DEBUG("IPIP traffic to/from tunnel: drop");
deny_reason(ctx, CALI_REASON_UNAUTH_SOURCE);
Expand All @@ -212,7 +212,7 @@ static CALI_BPF_INLINE int tc_state_fill_from_nexthdr(struct cali_tc_ctx *ctx, b
deny_reason(ctx, CALI_REASON_UNAUTH_SOURCE);
goto deny;
}
} else if (CALI_F_TO_HEP && !CALI_F_TUNNEL && !CALI_F_L3_DEV) {
} else if (CALI_F_TO_HEP && !CALI_F_IPIP && !CALI_F_L3_DEV) {
if (rt_addr_is_remote_host(&ctx->state->ip_dst)) {
CALI_DEBUG("IPIP packet to known Calico host, allow.");
goto allow;
Expand Down
2 changes: 2 additions & 0 deletions felix/bpf-gpl/routes.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ enum cali_rt_flags {
CALI_RT_NO_DSR = 0x80,
CALI_RT_BLACKHOLE_DROP = 0x100,
CALI_RT_BLACKHOLE_REJECT = 0x200,
CALI_RT_VXLAN = 0x400,
};

struct cali_rt {
Expand Down Expand Up @@ -78,6 +79,7 @@ static CALI_BPF_INLINE enum cali_rt_flags cali_rt_lookup_flags(ipv46_addr_t *add
#define cali_rt_is_host(rt) ((rt)->flags & CALI_RT_HOST)
#define cali_rt_is_workload(rt) ((rt)->flags & CALI_RT_WORKLOAD)
#define cali_rt_is_tunneled(rt) ((rt)->flags & CALI_RT_TUNNELED)
#define cali_rt_is_vxlan(rt) ((rt)->flags & CALI_RT_VXLAN)
#define cali_rt_is_blackhole_drop(rt) ((rt)->flags & CALI_RT_BLACKHOLE_DROP)
#define cali_rt_is_blackhole_reject(rt) ((rt)->flags & CALI_RT_BLACKHOLE_REJECT)

Expand Down
5 changes: 3 additions & 2 deletions felix/bpf/hook/load.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ func (at AttachType) DefaultPolicy() DefPolicy {
return DefPolicyNone
}

if at.Type == tcdefs.EpTypeTunnel || at.Type == tcdefs.EpTypeL3Device {
if at.Type == tcdefs.EpTypeIPIP || at.Type == tcdefs.EpTypeL3Device || at.Type == tcdefs.EpTypeVXLAN {
return DefPolicyAllow
}

Expand All @@ -121,10 +121,11 @@ func initObjectFiles() {
epTypes := []tcdefs.EndpointType{
tcdefs.EpTypeWorkload,
tcdefs.EpTypeHost,
tcdefs.EpTypeTunnel,
tcdefs.EpTypeIPIP,
tcdefs.EpTypeL3Device,
tcdefs.EpTypeNAT,
tcdefs.EpTypeLO,
tcdefs.EpTypeVXLAN,
}
for _, epType := range epTypes {
epType := epType
Expand Down
1 change: 1 addition & 0 deletions felix/bpf/libbpf/libbpf.go
Original file line number Diff line number Diff line change
Expand Up @@ -420,6 +420,7 @@ func (t *TcGlobalData) Set(m *Map) error {
C.ushort(t.Profiling),
C.uint(t.NatIn),
C.uint(t.NatOut),
C.uint(t.OverlayTunnelID),
C.uint(t.LogFilterJmp),
&cJumps[0], // it is safe because we hold the reference here until we return.
&cJumpsV6[0],
Expand Down
2 changes: 2 additions & 0 deletions felix/bpf/libbpf/libbpf_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ void bpf_tc_set_globals(struct bpf_map *map,
ushort profiling,
uint natin,
uint natout,
uint overlay_tunnel_id,
uint log_filter_jmp,
uint *jumps,
uint *jumps6)
Expand All @@ -171,6 +172,7 @@ void bpf_tc_set_globals(struct bpf_map *map,
.profiling = profiling,
.natin_idx = natin,
.natout_idx = natout,
overlay_tunnel_id = overlay_tunnel_id,
.log_filter_jmp = log_filter_jmp,
};

Expand Down
Loading