Skip to content

[BPF] rework L2 overlay (vxlan) for performance #9965

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
de9318a
[BPF] setup vxlan device in external no-learning mode
tomastigera Mar 11, 2025
f029540
[BPF] redirect to vxlan with bpf_skb_set_tunnel_key
tomastigera Mar 12, 2025
02e65a3
[BPF] Apply tunnel key on for vxlan tunnels
tomastigera Mar 13, 2025
d8e28f7
[BPF] make host networked processes work with vxlan
tomastigera Mar 13, 2025
17d2539
[BPF] make the tunnel id (vxlan VNI) configurable
tomastigera Mar 14, 2025
91ffeb0
use logCtx consistently in vxlan_fdb
tomastigera Apr 1, 2025
2d52c0b
[BPF] fix forwarding to v6 vxlan
tomastigera Apr 1, 2025
bb849b2
[BPF] use a single vxlan device for dualstack
tomastigera Apr 2, 2025
8fb7481
Fix vxlan tests for bpf using a single device with mtu 1500
tomastigera Apr 7, 2025
37ecb15
[BPF] Do not set tunnel key on same-subnet dests
tomastigera Apr 7, 2025
acf6545
[BPF] fix host to service via vxlan without ctlb
tomastigera Apr 8, 2025
d4adcb0
[BPF] fix configuring vxlan-v6.calico in iptables mode
tomastigera Apr 8, 2025
5674eb6
[BPF] add v6 vxlan run with tcp and ctlb enabled
tomastigera Apr 8, 2025
f222c41
[BPF] remove vxlan-v6.calico device after switching to ebpf
tomastigera Apr 8, 2025
d89bd3f
"should have correct routes" is not modified for v6
tomastigera Apr 8, 2025
b72b0c3
[BPF] fix testing removing devices when vxlan is turned off
tomastigera Apr 9, 2025
71c969a
reduce bpf verbosity in vxlan tests
tomastigera Apr 9, 2025
1d817a3
[BPF] handle borrowed IP for tunnel HEPs correctly
tomastigera Apr 29, 2025
15ebcb1
[BPF] seamless switch to ebpf with vxlan does not work
tomastigera Apr 30, 2025
64798d9
[BPF] use the single vxlan dev only when CO-RE is available
tomastigera Apr 30, 2025
22066c8
[BPF] vxlan fix minor remarks / naming
tomastigera May 8, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions felix/bpf-gpl/bpf.h
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,7 @@ extern const volatile struct cali_tc_preamble_globals __globals;
#define WG_PORT CALI_CONFIGURABLE(wg_port)
#define NATIN_IFACE CALI_CONFIGURABLE(natin_idx)
#define PROFILING CALI_CONFIGURABLE(profiling)
#define OVERLAY_TUNNEL_ID CALI_CONFIGURABLE(overlay_tunnel_id)

#define FLOWLOGS_ENABLED (GLOBAL_FLAGS & CALI_GLOBALS_FLOWLOGS_ENABLED)

Expand Down
126 changes: 121 additions & 5 deletions felix/bpf-gpl/fib_co_re.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,26 @@

#include "profiling.h"

#include <linux/if_packet.h>

/* try_redirect_to_peer attempts to hand the packet to the peer of the
 * interface recorded in the conntrack result, using bpf_redirect_peer().
 *
 * The redirect is only attempted when all of the following hold:
 *   - the CALI_GLOBALS_REDIRECT_PEER global flag is set,
 *   - the conntrack result code is CALI_CT_ESTABLISHED_BYPASS,
 *   - the conntrack result carries a valid forwarding ifindex,
 *   - the conntrack entry is not flagged CALI_CT_FLAG_SKIP_REDIR_PEER.
 *
 * Returns TC_ACT_REDIRECT when the helper accepted the redirect, otherwise
 * TC_ACT_UNSPEC so that the caller can carry on with its other forwarding
 * options.
 */
static CALI_BPF_INLINE int try_redirect_to_peer(struct cali_tc_ctx *ctx)
{
	struct cali_tc_state *state = ctx->state;

	/* Feature switched off globally. */
	if (!(GLOBAL_FLAGS & CALI_GLOBALS_REDIRECT_PEER)) {
		return TC_ACT_UNSPEC;
	}

	/* Only flows that conntrack reports as established-bypass qualify. */
	if (ct_result_rc(state->ct_result.rc) != CALI_CT_ESTABLISHED_BYPASS) {
		return TC_ACT_UNSPEC;
	}

	/* Without a recorded forwarding interface there is nothing to redirect to. */
	if (state->ct_result.ifindex_fwd == CT_INVALID_IFINDEX) {
		return TC_ACT_UNSPEC;
	}

	/* The conntrack entry explicitly opted out of peer redirection. */
	if (state->ct_result.flags & CALI_CT_FLAG_SKIP_REDIR_PEER) {
		return TC_ACT_UNSPEC;
	}

	if (bpf_redirect_peer(state->ct_result.ifindex_fwd, 0) != TC_ACT_REDIRECT) {
		/* Any other helper result is treated as "not redirected". */
		return TC_ACT_UNSPEC;
	}

	CALI_DEBUG("Redirect to peer interface (%d) succeeded.", state->ct_result.ifindex_fwd);
	return TC_ACT_REDIRECT;
}

static CALI_BPF_INLINE int forward_or_drop(struct cali_tc_ctx *ctx)
{
int rc = ctx->fwd.res;
Expand Down Expand Up @@ -86,18 +106,111 @@ static CALI_BPF_INLINE int forward_or_drop(struct cali_tc_ctx *ctx)
/* fall through to FIB if enabled or the IP stack, don't give up yet. */
rc = TC_ACT_UNSPEC;
} else if (CALI_F_FROM_HEP && bpf_core_enum_value_exists(enum bpf_func_id, BPF_FUNC_redirect_peer)) {
bool redirect_peer = GLOBAL_FLAGS & CALI_GLOBALS_REDIRECT_PEER;
if ((rc = try_redirect_to_peer(ctx)) == TC_ACT_REDIRECT) {
goto skip_fib;
}
} else if (CALI_F_FROM_WEP && fwd_fib(&ctx->fwd)) {
struct cali_rt *dest_rt = cali_rt_lookup(&ctx->state->ip_dst);
if (dest_rt == NULL) {
CALI_DEBUG("No route for " IP_FMT " to forward from WEP", &ctx->state->ip_dst);
goto try_fib_external;
}

if (cali_rt_flags_local_host(dest_rt->flags)) {
goto skip_fib;
}

if (state->ct_result.ifindex_fwd == CT_INVALID_IFINDEX) {
*fib_params(ctx) = (struct bpf_fib_lookup) {
#ifdef IPVER6
.family = 10, /* AF_INET6 */
#else
.family = 2, /* AF_INET */
#endif
.tot_len = 0,
.ifindex = CALI_F_TO_HOST ? ctx->skb->ingress_ifindex : ctx->skb->ifindex,
.l4_protocol = state->ip_proto,
};
#ifdef IPVER6
ipv6_addr_t_to_be32_4_ip(fib_params(ctx)->ipv6_src, &state->ip_src);
ipv6_addr_t_to_be32_4_ip(fib_params(ctx)->ipv6_dst, &state->ip_dst);
#else
fib_params(ctx)->ipv4_src = state->ip_src;
fib_params(ctx)->ipv4_dst = state->ip_dst;
#endif

if (redirect_peer && ct_result_rc(state->ct_result.rc) == CALI_CT_ESTABLISHED_BYPASS &&
state->ct_result.ifindex_fwd != CT_INVALID_IFINDEX && !(ctx->state->ct_result.flags & CALI_CT_FLAG_SKIP_REDIR_PEER)) {
rc = bpf_redirect_peer(state->ct_result.ifindex_fwd, 0);
rc = bpf_fib_lookup(ctx->skb, fib_params(ctx), sizeof(struct bpf_fib_lookup), 0);
state->ct_result.ifindex_fwd = fib_params(ctx)->ifindex;
}

if (cali_rt_flags_local_workload(dest_rt->flags)) {
if ((rc = try_redirect_to_peer(ctx)) == TC_ACT_REDIRECT) {
goto skip_fib;
}
} else if (cali_rt_is_vxlan(dest_rt) && !(cali_rt_is_same_subnet(dest_rt))) {
struct bpf_tunnel_key key = {
.tunnel_id = OVERLAY_TUNNEL_ID,
};
__u64 flags = 0;
__u32 size = 0;
#ifdef IPVER6
ipv6_addr_t_to_be32_4_ip(key.remote_ipv6, &dest_rt->next_hop);
flags |= BPF_F_TUNINFO_IPV6;
size = offsetof(struct bpf_tunnel_key, local_ipv6);
#else
key.remote_ipv4 = bpf_htonl(dest_rt->next_hop);
flags |= BPF_F_ZERO_CSUM_TX;
size = offsetof(struct bpf_tunnel_key, local_ipv4);
#endif

int err = bpf_skb_set_tunnel_key(ctx->skb, &key, size, flags);
CALI_DEBUG("bpf_skb_set_tunnel_key %d nh " IP_FMT, err, &dest_rt->next_hop);

rc = bpf_redirect(state->ct_result.ifindex_fwd, 0);
if (rc == TC_ACT_REDIRECT) {
CALI_DEBUG("Redirect to peer interface (%d) succeeded.", state->ct_result.ifindex_fwd);
CALI_DEBUG("Redirect to dev %d without fib lookup",
state->ct_result.ifindex_fwd);
goto skip_fib;
}
}
} else if (CALI_F_VXLAN && CALI_F_TO_HEP) {
if (!(ctx->skb->mark & CALI_SKB_MARK_SEEN) || (ctx->fwd.mark & CALI_SKB_MARK_FROM_NAT_IFACE_OUT)) {
/* packet to vxlan from the host, needs to set tunnel key. Either
* it wasn't seen or it was routed via the bpfnat device because
* its destination was a service and CTLB is disabled
*/
struct cali_rt *dest_rt = cali_rt_lookup(&ctx->state->ip_dst);
if (dest_rt == NULL) {
CALI_DEBUG("No route for " IP_FMT " at vxlan device", &ctx->state->ip_dst);
goto deny;
}
if (!cali_rt_is_vxlan(dest_rt)) {
CALI_DEBUG("Not a vxlan route for " IP_FMT " at vxlan device", &ctx->state->ip_dst);
goto deny;
}

struct bpf_tunnel_key key = {
.tunnel_id = OVERLAY_TUNNEL_ID,
};

__u64 flags = 0;
__u32 size = 0;
#ifdef IPVER6
ipv6_addr_t_to_be32_4_ip(key.remote_ipv6, &dest_rt->next_hop);
flags |= BPF_F_TUNINFO_IPV6;
size = offsetof(struct bpf_tunnel_key, local_ipv6);
#else
key.remote_ipv4 = bpf_htonl(dest_rt->next_hop);
flags |= BPF_F_ZERO_CSUM_TX;
size = offsetof(struct bpf_tunnel_key, local_ipv4);
#endif

int err = bpf_skb_set_tunnel_key(ctx->skb, &key, size, flags);
CALI_DEBUG("bpf_skb_set_tunnel_key %d nh " IP_FMT, err, &dest_rt->next_hop);
}
}

try_fib_external:
#if CALI_FIB_ENABLED
/* Only do FIB for packets to be turned around at a HEP on HEP egress. */
if (CALI_F_TO_HEP && !(ctx->state->flags & CALI_ST_CT_NP_LOOP)) {
Expand Down Expand Up @@ -363,6 +476,9 @@ static CALI_BPF_INLINE int forward_or_drop(struct cali_tc_ctx *ctx)
CALI_INFO("Final result=DENY (%d). Program execution time: %lluns",
reason, prog_end_time-state->prog_start_time);
} else {
if (CALI_F_VXLAN && CALI_F_TO_HOST) {
bpf_skb_change_type(ctx->skb, PACKET_HOST);
}
CALI_INFO("Final result=ALLOW rc %d. Program execution time: %lluns",
rc, prog_end_time-state->prog_start_time);
}
Expand Down
1 change: 1 addition & 0 deletions felix/bpf-gpl/globals.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ struct name { \
__be16 profiling; \
__u32 natin_idx; \
__u32 natout_idx; \
__u32 overlay_tunnel_id; \
__u8 iface_name[16]; \
__u32 log_filter_jmp; \
__u32 jumps[40]; \
Expand Down
7 changes: 1 addition & 6 deletions felix/bpf-gpl/nat.h
Original file line number Diff line number Diff line change
Expand Up @@ -159,12 +159,7 @@ static CALI_BPF_INLINE int vxlan_attempt_decap(struct cali_tc_ctx *ctx)
{
/* decap on host ep only if directly for the node */
CALI_DEBUG("VXLAN tunnel packet to " IP_FMT " (host IP=" IP_FMT ")",
#ifdef IPVER6
bpf_ntohl(ip_hdr(ctx)->daddr.in6_u.u6_addr32[3]),
#else
bpf_ntohl(ip_hdr(ctx)->daddr),
#endif
debug_ip(HOST_IP));
&ip_hdr(ctx)->daddr, debug_ip(HOST_IP));

if (!rt_addr_is_local_host((ipv46_addr_t *)&ip_hdr(ctx)->daddr)) {
goto fall_through;
Expand Down
3 changes: 3 additions & 0 deletions felix/bpf-gpl/routes.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ enum cali_rt_flags {
CALI_RT_NO_DSR = 0x80,
CALI_RT_BLACKHOLE_DROP = 0x100,
CALI_RT_BLACKHOLE_REJECT = 0x200,
CALI_RT_VXLAN = 0x400,
CALI_RT_VM_WORKLOAD = 0x800,
};

Expand Down Expand Up @@ -79,6 +80,8 @@ static CALI_BPF_INLINE enum cali_rt_flags cali_rt_lookup_flags(ipv46_addr_t *add
#define cali_rt_is_host(rt) ((rt)->flags & CALI_RT_HOST)
#define cali_rt_is_workload(rt) ((rt)->flags & CALI_RT_WORKLOAD)
#define cali_rt_is_tunneled(rt) ((rt)->flags & CALI_RT_TUNNELED)
#define cali_rt_is_vxlan(rt) ((rt)->flags & CALI_RT_VXLAN)
#define cali_rt_is_same_subnet(rt) ((rt)->flags & CALI_RT_SAME_SUBNET)
#define cali_rt_is_blackhole_drop(rt) ((rt)->flags & CALI_RT_BLACKHOLE_DROP)
#define cali_rt_is_blackhole_reject(rt) ((rt)->flags & CALI_RT_BLACKHOLE_REJECT)

Expand Down
4 changes: 2 additions & 2 deletions felix/bpf/bpf_syscall.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import (
"golang.org/x/sys/unix"

"github.com/projectcalico/calico/felix/bpf/asm"
"github.com/projectcalico/calico/felix/bpf/bpfutils"
"github.com/projectcalico/calico/felix/bpf/utils"
)

// #include "bpf_syscall.h"
Expand All @@ -39,7 +39,7 @@ const maxLogSize = 128 * 1024 * 1024

func LoadBPFProgramFromInsns(insns asm.Insns, name, license string, progType uint32) (fd ProgFD, err error) {
log.Debugf("LoadBPFProgramFromInsns(%v, %q, %v, %v)", insns, name, license, progType)
bpfutils.IncreaseLockedMemoryQuota()
utils.IncreaseLockedMemoryQuota()

// Occasionally see retryable errors here, retry silently a few times before going into log-collection mode.
backoff := 1 * time.Millisecond
Expand Down
48 changes: 0 additions & 48 deletions felix/bpf/bpfutils/bpf_utils.go

This file was deleted.

2 changes: 1 addition & 1 deletion felix/bpf/hook/load.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ package hook
import (
"strings"

"github.com/projectcalico/calico/felix/bpf/bpfutils"
tcdefs "github.com/projectcalico/calico/felix/bpf/tc/defs"
bpfutils "github.com/projectcalico/calico/felix/bpf/utils"
)

func init() {
Expand Down
5 changes: 3 additions & 2 deletions felix/bpf/libbpf/libbpf.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ import (

"golang.org/x/sys/unix"

"github.com/projectcalico/calico/felix/bpf/bpfutils"
"github.com/projectcalico/calico/felix/bpf/utils"
)

// #cgo CFLAGS: -I${SRCDIR}/../../bpf-gpl/libbpf/src -I${SRCDIR}/../../bpf-gpl/libbpf/include/uapi -I${SRCDIR}/../../bpf-gpl -Werror
Expand Down Expand Up @@ -101,7 +101,7 @@ func (m *Map) IsJumpMap() bool {
}

func OpenObject(filename string) (*Obj, error) {
bpfutils.IncreaseLockedMemoryQuota()
utils.IncreaseLockedMemoryQuota()
cFilename := C.CString(filename)
defer C.free(unsafe.Pointer(cFilename))
obj, err := C.bpf_obj_open(cFilename)
Expand Down Expand Up @@ -420,6 +420,7 @@ func (t *TcGlobalData) Set(m *Map) error {
C.ushort(t.Profiling),
C.uint(t.NatIn),
C.uint(t.NatOut),
C.uint(t.OverlayTunnelID),
C.uint(t.LogFilterJmp),
&cJumps[0], // it is safe because we hold the reference here until we return.
&cJumpsV6[0],
Expand Down
2 changes: 2 additions & 0 deletions felix/bpf/libbpf/libbpf_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ void bpf_tc_set_globals(struct bpf_map *map,
ushort profiling,
uint natin,
uint natout,
uint overlay_tunnel_id,
uint log_filter_jmp,
uint *jumps,
uint *jumps6)
Expand All @@ -171,6 +172,7 @@ void bpf_tc_set_globals(struct bpf_map *map,
.profiling = profiling,
.natin_idx = natin,
.natout_idx = natout,
.overlay_tunnel_id = overlay_tunnel_id,
.log_filter_jmp = log_filter_jmp,
};

Expand Down
35 changes: 18 additions & 17 deletions felix/bpf/libbpf/libbpf_common.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,23 +23,24 @@ type GlobalData interface {
}

type TcGlobalData struct {
IfaceName string
HostIPv4 [16]byte
IntfIPv4 [16]byte
ExtToSvcMark uint32
Tmtu uint16
VxlanPort uint16
PSNatStart uint16
PSNatLen uint16
HostTunnelIPv4 [16]byte
Flags uint32
WgPort uint16
Wg6Port uint16
Profiling uint16
NatIn uint32
NatOut uint32
LogFilterJmp uint32
Jumps [40]uint32
IfaceName string
HostIPv4 [16]byte
IntfIPv4 [16]byte
ExtToSvcMark uint32
Tmtu uint16
VxlanPort uint16
PSNatStart uint16
PSNatLen uint16
HostTunnelIPv4 [16]byte
Flags uint32
WgPort uint16
Wg6Port uint16
Profiling uint16
NatIn uint32
NatOut uint32
OverlayTunnelID uint32
LogFilterJmp uint32
Jumps [40]uint32

HostIPv6 [16]byte
IntfIPv6 [16]byte
Expand Down
3 changes: 1 addition & 2 deletions felix/bpf/nat/connecttime.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ import (

"github.com/projectcalico/calico/felix/bpf"
"github.com/projectcalico/calico/felix/bpf/bpfdefs"
"github.com/projectcalico/calico/felix/bpf/bpfutils"
"github.com/projectcalico/calico/felix/bpf/jump"
"github.com/projectcalico/calico/felix/bpf/libbpf"
"github.com/projectcalico/calico/felix/bpf/maps"
Expand Down Expand Up @@ -270,7 +269,7 @@ func ProgFileName(logLevel string, ipver string) string {
}

btf := ""
if bpfutils.BTFEnabled {
if utils.BTFEnabled {
btf = "_co-re"
}

Expand Down
1 change: 1 addition & 0 deletions felix/bpf/routes/map.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ const (
FlagNoDSR Flags = 0x80
FlagBlackHoleDrop Flags = 0x100
FlagBlackHoleReject Flags = 0x200
FlagVXLAN Flags = 0x400
FlagVMWorkload Flags = 0x800

FlagsUnknown Flags = 0
Expand Down
1 change: 1 addition & 0 deletions felix/bpf/tc/attach.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ type AttachPoint struct {
UDPOnly bool
RedirectPeer bool
FlowLogsEnabled bool
OverlayTunnelID uint32
}

var ErrDeviceNotFound = errors.New("device not found")
Expand Down
Loading