Skip to content

Commit 18f5774

Browse files
lunarthegreyclaude
andcommitted
fast-path: rewrite bpf_fib_lookup init to never produce memset patterns
Initial attempt (reverted in this commit) was to provide our own #[no_mangle] #[inline(always)] memset/memcpy/memmove. rustc warns "`#[inline]` is ignored on externally exported functions" — meaning the alwaysinline attribute is dropped, the functions stay as separate symbols, and LLVM's libcall lowering still produces bpf-to-bpf calls. Same problem. Real fix: never produce code patterns LLVM merges into memset libcalls. Three patterns triggered the verifier failure on UniFi: - 3 bytes: `(*mctx_ptr)._pad = [0; 3]` in forward_success - 6 bytes: `bpf_fib_lookup.smac` and `dmac` zero-init in struct literal - 22 bytes: combined zero block from `[u32::from_ne_bytes(src), 0, 0, 0]` (twice, for ipv6_src/dst), `tbid: 0`, and `[0; 6]` arms. Replaced the struct-literal init with `MaybeUninit<bpf_fib_lookup>` plus selective field writes via raw pointer: - Input fields (read by kernel): family, l4_protocol, sport, dport, tot_len, ifindex, tos/flowinfo, ipv4/v6_src, ipv4/v6_dst, tbid. - Output fields (written by kernel): smac, dmac — left uninit. - Unused union bytes (kernel doesn't read on this path): also left uninit. No `[0; N]` literals. No struct literal with adjacent zeros. Single field stores at non-overlapping offsets — LLVM has no run of zeros to merge. Also dropped the explicit `_pad = [0; 3]` write in forward_success: per-CPU map elements are zero-initialized at map creation and we never overwrite the padding region, so the explicit zero-init was both a no-op and the source of the 3-byte memset libcall. `&*p` borrows from the raw pointer for compare_and_bump and dispatch_fib. Those callers only read fields the kernel filled in (ifindex, smac, dmac) — uninit padding bytes aren't observed. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent fdca005 commit 18f5774

1 file changed

Lines changed: 85 additions & 73 deletions

File tree

  • crates/modules/fast-path/bpf/src

crates/modules/fast-path/bpf/src/main.rs

Lines changed: 85 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,9 @@
1313

1414
use aya_ebpf::{
1515
bindings::{
16-
bpf_fib_lookup, bpf_fib_lookup__bindgen_ty_1, bpf_fib_lookup__bindgen_ty_2,
17-
bpf_fib_lookup__bindgen_ty_3, bpf_fib_lookup__bindgen_ty_4,
18-
bpf_fib_lookup__bindgen_ty_5, xdp_action, BPF_FIB_LKUP_RET_BLACKHOLE,
19-
BPF_FIB_LKUP_RET_FRAG_NEEDED, BPF_FIB_LKUP_RET_NO_NEIGH, BPF_FIB_LKUP_RET_PROHIBIT,
20-
BPF_FIB_LKUP_RET_SUCCESS, BPF_FIB_LKUP_RET_UNREACHABLE,
16+
bpf_fib_lookup, xdp_action, BPF_FIB_LKUP_RET_BLACKHOLE, BPF_FIB_LKUP_RET_FRAG_NEEDED,
17+
BPF_FIB_LKUP_RET_NO_NEIGH, BPF_FIB_LKUP_RET_PROHIBIT, BPF_FIB_LKUP_RET_SUCCESS,
18+
BPF_FIB_LKUP_RET_UNREACHABLE,
2119
},
2220
helpers::gen::{
2321
bpf_fib_lookup as fib_lookup_helper, bpf_xdp_adjust_head, bpf_xdp_adjust_tail,
@@ -252,54 +250,67 @@ fn handle_ipv4(
252250
return dispatch_custom_fib(custom, ctx, eth, ip as *mut u8, true, ingress_vid);
253251
}
254252

255-
// Build the FIB-lookup struct via a struct literal, NOT
256-
// `mem::zeroed()`. LLVM lowers a 60-byte zero-init into a memset
257-
// libcall that the BPF backend emits as a separate bpf-to-bpf
258-
// function. Combined with v0.2.5's `bpf_tail_call`, that violates
259-
// the kernel verifier's "tail_calls are not allowed in non-JITed
260-
// programs with bpf-to-bpf calls" guard on UniFi 5.15, which runs
261-
// BPF in interpreter mode. Each union is initialized via its
262-
// largest variant so every byte is written explicitly — no memset
263-
// libcall, no subprogram. See SPEC §11.x and the v0.2.5 UniFi
264-
// post-mortem in tail-call-architecture.md.
265-
let mut fib = bpf_fib_lookup {
266-
family: AF_INET,
267-
l4_protocol: proto,
268-
sport,
269-
dport,
270-
__bindgen_anon_1: bpf_fib_lookup__bindgen_ty_1 {
271-
tot_len: u16::from_be_bytes(unsafe { (*ip).tot_len }),
272-
},
273-
ifindex: unsafe { (*ctx.ctx).ingress_ifindex },
274-
// 4-byte union: write the 4-byte `flowinfo` arm so all bytes
275-
// are covered. For IPv4 the kernel only reads `tos` (the LSB),
276-
// so put tos in the low byte and zero the rest.
277-
__bindgen_anon_2: bpf_fib_lookup__bindgen_ty_2 {
278-
flowinfo: unsafe { (*ip).tos } as u32,
279-
},
280-
// 16-byte unions: write the 16-byte `ipv6_*` arms. For IPv4
281-
// the kernel reads the first 4 bytes as `ipv4_*`; that's the
282-
// first u32 of the array, with the rest zero.
283-
__bindgen_anon_3: bpf_fib_lookup__bindgen_ty_3 {
284-
ipv6_src: [u32::from_ne_bytes(src_bytes), 0, 0, 0],
285-
},
286-
__bindgen_anon_4: bpf_fib_lookup__bindgen_ty_4 {
287-
ipv6_dst: [u32::from_ne_bytes(dst_bytes), 0, 0, 0],
288-
},
289-
__bindgen_anon_5: bpf_fib_lookup__bindgen_ty_5 { tbid: 0 },
290-
smac: [0; 6],
291-
dmac: [0; 6],
292-
};
253+
// Build `bpf_fib_lookup` via `MaybeUninit` + selective field
254+
// writes — NEVER as a struct literal with zero-padded arrays
255+
// ([0; 6], [val, 0, 0, 0], etc.) and NEVER with `mem::zeroed()`.
256+
//
257+
// Why: LLVM's "merge stores" pass recognizes any run of adjacent
258+
// zero stores and lowers them into a `memset` libcall. The BPF
259+
// backend has no libc to link against, so memset becomes a
260+
// separate BPF function — a bpf-to-bpf subprogram call. That
261+
// combined with `bpf_tail_call` (which fast_path uses to enter
262+
// finalize) trips the kernel verifier's
263+
// "tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls"
264+
// guard on kernels whose JIT returns false from
265+
// `bpf_jit_supports_subprog_tailcalls()` (UniFi 5.15-ui-cn9670
266+
// aarch64 is the canonical hit).
267+
//
268+
// We can't fix this with `#[no_mangle] #[inline(always)]` shim
269+
// functions either: rustc warns "`#[inline]` is ignored on
270+
// externally exported functions" and refuses to inline at the
271+
// libcall site.
272+
//
273+
// The only reliable mitigation is to never produce the merge-able
274+
// pattern in the first place. So: write each input field
275+
// individually via raw pointer; leave smac/dmac uninit (kernel
276+
// writes them as outputs); leave the unread union bytes uninit
277+
// (kernel doesn't read offsets 13-15 of __bindgen_anon_2 or
278+
// 20-31/36-47 of __bindgen_anon_3/4 in the IPv4 path).
279+
let mut fib_uninit: core::mem::MaybeUninit<bpf_fib_lookup> =
280+
core::mem::MaybeUninit::uninit();
281+
let p = fib_uninit.as_mut_ptr();
282+
unsafe {
283+
(*p).family = AF_INET;
284+
(*p).l4_protocol = proto;
285+
(*p).sport = sport;
286+
(*p).dport = dport;
287+
(*p).__bindgen_anon_1.tot_len = u16::from_be_bytes((*ip).tot_len);
288+
(*p).ifindex = (*ctx.ctx).ingress_ifindex;
289+
(*p).__bindgen_anon_2.tos = (*ip).tos;
290+
(*p).__bindgen_anon_3.ipv4_src = u32::from_ne_bytes(src_bytes);
291+
(*p).__bindgen_anon_4.ipv4_dst = u32::from_ne_bytes(dst_bytes);
292+
// tbid: routing table ID. 0 = main table. Single u32 store —
293+
// no merge with other zero stores because adjacent fields are
294+
// either non-zero (the writes above) or kernel-output (smac
295+
// immediately follows tbid at offset 52).
296+
(*p).__bindgen_anon_5.tbid = 0;
297+
}
293298

294299
let ret = unsafe {
295300
fib_lookup_helper(
296301
ctx.ctx as *mut _,
297-
&mut fib as *mut _,
302+
p as *mut _,
298303
mem::size_of::<bpf_fib_lookup>() as i32,
299304
0,
300305
)
301306
};
302307

308+
// Borrow from the raw pointer for the rest of the function.
309+
// After fib_lookup_helper, the kernel has written smac/dmac on
310+
// success; on failure, dispatch_fib doesn't read them. Other
311+
// unread union bytes stay uninit but aren't observed.
312+
let fib_ref = unsafe { &*p };
313+
303314
if compare {
304315
// Compare mode: we also ran above iff `use_custom`; but since
305316
// `compare` implies `use_custom` (userspace rejects otherwise),
@@ -309,10 +320,10 @@ fn handle_ipv4(
309320
// or manual map poke), the branch above is unreachable and
310321
// we still do only the kernel lookup here.
311322
let custom = fib::lookup_v4(src_bytes, dst_bytes, proto, sport_h, dport_h);
312-
compare_and_bump(ret as u32, &fib, &custom);
323+
compare_and_bump(ret as u32, fib_ref, &custom);
313324
}
314325

315-
dispatch_fib(ret as u32, ctx, eth, ip as *mut u8, true, &fib, ingress_vid)
326+
dispatch_fib(ret as u32, ctx, eth, ip as *mut u8, true, fib_ref, ingress_vid)
316327
}
317328

318329
#[inline(always)]
@@ -379,47 +390,44 @@ fn handle_ipv6(
379390
return dispatch_custom_fib(custom, ctx, eth, ip as *mut u8, false, ingress_vid);
380391
}
381392

382-
// See handle_ipv4 for the rationale behind the struct-literal
393+
// See handle_ipv4 for the rationale behind the MaybeUninit
383394
// pattern (avoids the LLVM-emitted memset bpf-to-bpf subprogram
384395
// that conflicts with bpf_tail_call on non-JITed kernels).
385-
let mut fib = bpf_fib_lookup {
386-
family: AF_INET6,
387-
l4_protocol: next,
388-
sport,
389-
dport,
390-
__bindgen_anon_1: bpf_fib_lookup__bindgen_ty_1 {
391-
tot_len: u16::from_be_bytes(unsafe { (*ip).payload_len }) + Ipv6Hdr::LEN as u16,
392-
},
393-
ifindex: unsafe { (*ctx.ctx).ingress_ifindex },
394-
__bindgen_anon_2: bpf_fib_lookup__bindgen_ty_2 {
395-
flowinfo: u32::from_be_bytes(unsafe { (*ip).vcf }),
396-
},
397-
__bindgen_anon_3: bpf_fib_lookup__bindgen_ty_3 {
398-
ipv6_src: bytes_to_u32x4(&src_bytes),
399-
},
400-
__bindgen_anon_4: bpf_fib_lookup__bindgen_ty_4 {
401-
ipv6_dst: bytes_to_u32x4(&dst_bytes),
402-
},
403-
__bindgen_anon_5: bpf_fib_lookup__bindgen_ty_5 { tbid: 0 },
404-
smac: [0; 6],
405-
dmac: [0; 6],
406-
};
396+
let mut fib_uninit: core::mem::MaybeUninit<bpf_fib_lookup> =
397+
core::mem::MaybeUninit::uninit();
398+
let p = fib_uninit.as_mut_ptr();
399+
unsafe {
400+
(*p).family = AF_INET6;
401+
(*p).l4_protocol = next;
402+
(*p).sport = sport;
403+
(*p).dport = dport;
404+
(*p).__bindgen_anon_1.tot_len =
405+
u16::from_be_bytes((*ip).payload_len) + Ipv6Hdr::LEN as u16;
406+
(*p).ifindex = (*ctx.ctx).ingress_ifindex;
407+
(*p).__bindgen_anon_2.flowinfo = u32::from_be_bytes((*ip).vcf);
408+
(*p).__bindgen_anon_3.ipv6_src = bytes_to_u32x4(&src_bytes);
409+
(*p).__bindgen_anon_4.ipv6_dst = bytes_to_u32x4(&dst_bytes);
410+
(*p).__bindgen_anon_5.tbid = 0;
411+
// smac, dmac: kernel-output, leave uninit.
412+
}
407413

408414
let ret = unsafe {
409415
fib_lookup_helper(
410416
ctx.ctx as *mut _,
411-
&mut fib as *mut _,
417+
p as *mut _,
412418
mem::size_of::<bpf_fib_lookup>() as i32,
413419
0,
414420
)
415421
};
416422

423+
let fib_ref = unsafe { &*p };
424+
417425
if compare {
418426
let custom = fib::lookup_v6(src_bytes, dst_bytes, next, sport_h, dport_h);
419-
compare_and_bump(ret as u32, &fib, &custom);
427+
compare_and_bump(ret as u32, fib_ref, &custom);
420428
}
421429

422-
dispatch_fib(ret as u32, ctx, eth, ip as *mut u8, false, &fib, ingress_vid)
430+
dispatch_fib(ret as u32, ctx, eth, ip as *mut u8, false, fib_ref, ingress_vid)
423431
}
424432

425433
#[inline(always)]
@@ -529,7 +537,11 @@ fn forward_success(
529537
(*mctx_ptr).ingress_vid = ingress_vid;
530538
(*mctx_ptr).ip_offset = ip_offset as u32;
531539
(*mctx_ptr).is_v4 = u8::from(is_v4);
532-
(*mctx_ptr)._pad = [0; 3];
540+
// _pad: per-CPU array elements are zero-initialized at map
541+
// creation and we never write the padding region — leaving
542+
// it as the residual zeros saves a memset libcall (now
543+
// inlined via libcalls.rs anyway, but the moot store is
544+
// still worth eliding).
533545
}
534546
} else {
535547
// Per-CPU array index 0 is always present; this branch should

0 commit comments

Comments
 (0)