diff --git a/Cargo.lock b/Cargo.lock index 3b64876..17dcd25 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -680,7 +680,7 @@ checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" [[package]] name = "packetframe-cli" -version = "0.2.4" +version = "0.2.5" dependencies = [ "anyhow", "clap", @@ -697,7 +697,7 @@ dependencies = [ [[package]] name = "packetframe-common" -version = "0.2.4" +version = "0.2.5" dependencies = [ "anyhow", "flate2", @@ -710,7 +710,7 @@ dependencies = [ [[package]] name = "packetframe-fast-path" -version = "0.2.4" +version = "0.2.5" dependencies = [ "anyhow", "aya", @@ -733,7 +733,7 @@ dependencies = [ [[package]] name = "packetframe-probe" -version = "0.2.4" +version = "0.2.5" dependencies = [ "anyhow", "aya", diff --git a/Cargo.toml b/Cargo.toml index 2b0f9a3..df4122b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,7 +23,7 @@ exclude = [ # detach pacing for bridge members, integrity-check Total-line parse # fix). See SPEC.md §11.14 for the rollout history and # `docs/runbooks/custom-fib.md` for operations. -version = "0.2.4" +version = "0.2.5" edition = "2021" # MSRV. Deliberately behind the rust-toolchain.toml pin (which is the # latest stable) so a contributor with a slightly older toolchain still diff --git a/README.md b/README.md index cf2bde7..88fda90 100644 --- a/README.md +++ b/README.md @@ -71,8 +71,9 @@ PacketFrame complements existing routing daemons rather than replacing them. 
The | Connected-destination fast-path (`local-prefix`) | Production (v0.2.1+) | | `fallback-default` synthesis | Production (v0.2.1+) | | `block-prefix` XDP-time drop | Production (v0.2.1+) | -| `mss-clamp` directive (fast-path) | Production (v0.2.4+) | +| `mss-clamp` directive (fast-path) | Production (v0.2.4+; per-prefix loads on stricter kernels in v0.2.5+) | | `packetframe reconfigure` / `systemctl reload packetframe` | Production (v0.2.4+) | +| Two-stage BPF datapath (`fast_path` + `finalize` via `bpf_tail_call`) | Production (v0.2.5+) — see [docs/runbooks/tail-call-architecture.md](docs/runbooks/tail-call-architecture.md) | | `probe` module — diagnostic XDP | Production | | `ddos` module — XDP-time SYN-flood + amplification filter | Future — sketched in SPEC §5.2 (priority 0–999, security/admission) | | `sampler` module — per-flow ringbuf observability | Future — sketched in SPEC §5.3 (priority 2000–2999, observation) | @@ -86,7 +87,7 @@ Releases are published on the [GitHub releases page](https://github.com/unredact ### Debian / Ubuntu (.deb) ```sh -VERSION=v0.2.4 +VERSION=v0.2.5 ARCH=$(dpkg --print-architecture) # amd64 or arm64 curl -LO "https://github.com/unredacted/packetframe/releases/download/${VERSION}/packetframe_${VERSION#v}_${ARCH}.deb" @@ -103,7 +104,7 @@ Installs `/usr/bin/packetframe`, the systemd unit at `/lib/systemd/system/packet For musl-static deployments, non-Debian distros, or anything else: ```sh -VERSION=v0.2.4 +VERSION=v0.2.5 TARGET=aarch64-unknown-linux-gnu # or: x86_64-unknown-linux-{gnu,musl}, aarch64-unknown-linux-musl curl -LO "https://github.com/unredacted/packetframe/releases/download/${VERSION}/packetframe-${VERSION}-${TARGET}.tar.gz" diff --git a/VERSION b/VERSION index abd4105..3a4036f 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.2.4 +0.2.5 diff --git a/crates/cli/src/loader.rs b/crates/cli/src/loader.rs index 67aa852..ae867f1 100644 --- a/crates/cli/src/loader.rs +++ b/crates/cli/src/loader.rs @@ -747,6 +747,14 @@ pub 
fn status(config_path: &Path) -> Result<(), String> { Err(e) => return Err(format!("registry read: {e}")), } + // v0.2.5+ tail-call chain summary. Confirms MUTATION_PROGS[0] + // is populated with the `finalize` program FD; if empty, + // fast_path's tail_call hits ErrTailCall and traffic falls + // through to kernel slow-path. Operators see this immediately + // in the status output rather than chasing it via err counter. + #[cfg(all(target_os = "linux", feature = "fast-path"))] + print_tail_call_chain(&config.global.bpffs_root); + // Live counter readback from the pinned STATS map. Works // whether or not the loader is running — the pin survives // process exit (§8.5). @@ -757,6 +765,24 @@ pub fn status(config_path: &Path) -> Result<(), String> { Ok(()) } +#[cfg(all(target_os = "linux", feature = "fast-path"))] +fn print_tail_call_chain(bpffs_root: &Path) { + use packetframe_fast_path::tail_call_chain_from_pin; + println!(); + println!("tail-call chain (from {}):", bpffs_root.display()); + match tail_call_chain_from_pin(bpffs_root) { + Ok(true) => println!( + " MUTATION_PROGS[0]: populated (finalize) — \ + confirm prog_id via `bpftool prog show name finalize`" + ), + Ok(false) => println!( + " MUTATION_PROGS[0]: EMPTY — fast_path's tail_call will fail; traffic \ + falls to kernel slow-path. Restart packetframe to repopulate." + ), + Err(e) => eprintln!(" MUTATION_PROGS pin unavailable ({e}); loader may not be attached"), + } +} + +#[cfg(all(target_os = "linux", feature = "fast-path"))] +fn print_stats(bpffs_root: &Path) { + // §4.6 counter names, indexed by `StatIdx` discriminants. Order // Append-only — adding new entries at the end is fine; renumbering // breaks dashboards. Indices 0-19 are the kernel-fib counter set; // 20-31 were appended in the Option F custom-FIB rollout (§4.11).
- const NAMES: [&str; 33] = [ + const NAMES: [&str; 37] = [ "rx_total", "matched_v4", "matched_v6", @@ -799,6 +825,12 @@ fn print_stats(bpffs_root: &Path) { "nexthop_seq_retry", "bmp_peer_down", "bogon_dropped", + // --- v0.2.4: mss-clamp --- + "mss_clamp_applied", + "mss_clamp_skipped", + // --- v0.2.5: two-stage datapath --- + "err_tail_call", + "err_mutation_ctx", ]; print_fib_status(bpffs_root); diff --git a/crates/modules/fast-path/bpf/src/finalize.rs b/crates/modules/fast-path/bpf/src/finalize.rs new file mode 100644 index 0000000..a634d2a --- /dev/null +++ b/crates/modules/fast-path/bpf/src/finalize.rs @@ -0,0 +1,448 @@ +//! Finalize stage: tail-called by `fast_path` after classification + +//! L2/TTL mutations. Owns mss-clamp lookup + mutation, VLAN choreography, +//! and the final `bpf_redirect_map` call. +//! +//! Lives in its own XDP program so it gets a fresh 512-byte BPF stack +//! budget. v0.2.4 inlined this work into `fast_path` and ran into UniFi +//! 5.15's stricter stack accounting (rejected at `combined stack size of +//! 3 calls is 544. Too large`). Splitting fixes the budget and provides +//! the pattern for future fast-path-internal stages. +//! +//! Communication from `fast_path` is via two side channels: +//! - The packet itself (preserved across `bpf_tail_call`). +//! - `MUTATION_CTX` per-CPU scratch (egress info, ingress VID, IP offset, +//! v4/v6 discriminator) — written by fast_path, read here. +//! +//! See SPEC.md §4.x "Two-stage BPF datapath" and +//! `docs/runbooks/tail-call-architecture.md`. + +use aya_ebpf::{ + bindings::xdp_action, + helpers::gen::bpf_xdp_adjust_head, + macros::xdp, + maps::lpm_trie::Key, + programs::XdpContext, +}; +use network_types::ip::{IpProto, Ipv4Hdr, Ipv6Hdr}; + +use crate::maps::{ + bump_stat, StatIdx, CFG, MSS_CLAMP_BY_IFACE, MSS_CLAMP_V4, MSS_CLAMP_V6, MUTATION_CTX, + REDIRECT_DEVMAP, +}; + +/// 802.1Q TPID. Mirror of `main::TPID_8021Q`; kept local so finalize +/// is self-contained. 
+const TPID_8021Q: u16 = 0x8100; + +/// Sentinel for "no VLAN" — mirror of `main::VLAN_NONE`. +const VLAN_NONE: u16 = 0; + +/// SYN flag in TCP byte 13. +const TCP_FLAG_SYN: u8 = 0x02; + +/// IANA TCP protocol number, materialized from `IpProto` (network-types +/// 0.2 changed `proto`/`next_hdr` to raw `u8`). +const PROTO_TCP: u8 = IpProto::Tcp as u8; + +/// Upper bound on `ip_offset` post-VLAN-parse. Used to give the BPF +/// verifier a tight `umax` so range propagation through packet-pointer +/// arithmetic works — see commentary on the `ip_offset > MAX_IP_OFFSET` +/// check in `finalize`. +const MAX_IP_OFFSET: usize = 64; + +#[xdp] +pub fn finalize(ctx: XdpContext) -> u32 { + // Read the per-CPU mutation context written by fast_path right + // before its tail_call. Always present in production; fail-safe + // XDP_PASS if missing so traffic falls through to kernel rather + // than getting dropped silently. + let mctx = match unsafe { MUTATION_CTX.get(0) } { + Some(c) => *c, + None => { + bump_stat(StatIdx::ErrMutationCtx); + return xdp_action::XDP_PASS; + } + }; + + let egress_ifindex = mctx.egress_ifindex; + let egress_vid = mctx.egress_vid; + let ingress_vid = mctx.ingress_vid; + let is_v4 = mctx.is_v4 != 0; + + // Clamp ip_offset to MAX_IP_OFFSET (64). The BPF verifier's + // `find_good_pkt_pointers` refuses to propagate range information + // through packet-pointer arithmetic when the scalar offset's + // umax_value exceeds MAX_PACKET_OFF (0xffff) — which is the case + // for `mctx.ip_offset` since it's read from a map and the verifier + // sees its full u32 range. Capping the offset gives the verifier + // a tight umax it can reason about, so the subsequent + // `pkt + ip_offset + ip_hdr_size > end` bound check actually + // propagates a usable readable-range back to `pkt + ip_offset`. 
+ // + // In practice fast_path writes `EthHdr::LEN` (14) or + // `EthHdr::LEN + VLAN_HDR_LEN` (18); 64 is comfortable headroom + // for a future second VLAN tag without having to revisit this. + let ip_offset = mctx.ip_offset as usize; + if ip_offset > MAX_IP_OFFSET { + return xdp_action::XDP_PASS; + } + + // mss-clamp first, then VLAN choreography (which can shift bytes + // via bpf_xdp_adjust_head). mss-clamp's TCP-options walk relies on + // ip_offset being valid relative to ctx.data() — true until VLAN + // push/pop changes the layout. + mss_clamp_inline(&ctx, ip_offset, is_v4, egress_ifindex); + + if apply_vlan_egress(&ctx, ingress_vid, egress_vid).is_err() { + bump_stat(StatIdx::ErrVlan); + return xdp_action::XDP_ABORTED; + } + + match REDIRECT_DEVMAP.redirect(egress_ifindex, 0) { + Ok(_) => { + bump_stat(StatIdx::FwdOk); + xdp_action::XDP_REDIRECT + } + Err(_) => { + bump_stat(StatIdx::ErrFibOther); + xdp_action::XDP_PASS + } + } +} + +// --- MSS clamping (relocated from main.rs in v0.2.5) ---------------------- + +/// Top-level entry: dispatch into the v4 or v6 path with a constant-sized +/// bounds check. Splitting upfront (rather than threading `is_v4` through +/// a single function) is what satisfies the BPF verifier — the bounds +/// check needs to use a compile-time-known size so the verifier can +/// track that subsequent reads via `*const Ipv4Hdr` / `*const Ipv6Hdr` +/// stay within the checked region. +/// +/// The ergonomic alternative — `let size = if is_v4 { 20 } else { 40 }; +/// if start + offset + size > end { ... }; ip_addr as *const Ipv4Hdr` — +/// loses the verifier's bound-tracking when the cast is reached: see +/// `R9 offset is outside of the packet` from the v0.2.5 prerelease build. 
+#[inline(always)] +fn mss_clamp_inline(ctx: &XdpContext, ip_offset: usize, is_v4: bool, egress_ifindex: u32) { + if is_v4 { + mss_clamp_v4(ctx, ip_offset, egress_ifindex); + } else { + mss_clamp_v6(ctx, ip_offset, egress_ifindex); + } +} + +/// IPv4 path: bounds-check exactly `Ipv4Hdr::LEN` bytes, then cast +/// directly to `*const Ipv4Hdr`. Mirrors the `ptr_at` pattern from +/// main.rs that the verifier accepts. +#[inline(always)] +fn mss_clamp_v4(ctx: &XdpContext, ip_offset: usize, egress_ifindex: u32) { + let start = ctx.data(); + let end = ctx.data_end(); + if start + ip_offset + Ipv4Hdr::LEN > end { + return; + } + let ip: *const Ipv4Hdr = (start + ip_offset) as *const Ipv4Hdr; + let proto = unsafe { (*ip).proto }; + if proto != PROTO_TCP { + return; + } + let clamp = lookup_mss_clamp_v4(ip, egress_ifindex); + if clamp == 0 { + return; + } + mss_clamp_tcp(ctx, ip as *const u8, Ipv4Hdr::LEN, clamp); +} + +/// IPv6 path: same pattern as `mss_clamp_v4` but with a 40-byte bound. +#[inline(always)] +fn mss_clamp_v6(ctx: &XdpContext, ip_offset: usize, egress_ifindex: u32) { + let start = ctx.data(); + let end = ctx.data_end(); + if start + ip_offset + Ipv6Hdr::LEN > end { + return; + } + let ip: *const Ipv6Hdr = (start + ip_offset) as *const Ipv6Hdr; + let proto = unsafe { (*ip).next_hdr }; + if proto != PROTO_TCP { + return; + } + let clamp = lookup_mss_clamp_v6(ip, egress_ifindex); + if clamp == 0 { + return; + } + mss_clamp_tcp(ctx, ip as *const u8, Ipv6Hdr::LEN, clamp); +} + +/// Walk the TCP-options block of a matched SYN/SYN-ACK and mutate the MSS +/// option in place if the existing MSS is greater than the clamp value. +/// Recomputes the TCP checksum incrementally (RFC 1624). Bumps +/// `MssClampApplied` on rewrite, `MssClampSkipped` on "policy applies but +/// no rewrite needed." +/// +/// Takes a typed `ip_ptr` (already bounds-checked for `ip_hdr_size` bytes) +/// rather than a raw `tcp_offset` scalar. 
Inside, we recover ip_offset as +/// `(ip_ptr as usize) - start`, which the BPF verifier tracks as a `pkt - +/// pkt` subtraction with `umax = MAX_PACKET_OFF (0xffff)`. That tight bound +/// is what makes subsequent `start + tcp_offset + N > end` checks +/// propagate readable-range to the read site (mirrors v0.2.4's working +/// pattern; passing `tcp_offset` directly as a `usize` from a map read +/// loses verifier tracking and the post-bound-check pkt pointer ends up +/// with `r=0`). +/// +/// Bounds-checked at every read against `ctx.data_end()`. Options walk +/// is fixed-bound at 8 iterations to keep BPF verifier state-space +/// exploration tractable (a 40-iteration walk hit the verifier's +/// 1M-instruction limit during v0.2.4 development). +#[inline(always)] +fn mss_clamp_tcp(ctx: &XdpContext, ip_ptr: *const u8, ip_hdr_size: usize, clamp: u16) { + let start = ctx.data(); + let end = ctx.data_end(); + + // pkt-derived scalar; verifier tracks umax tightly. + let ip_offset = (ip_ptr as usize) - start; + let tcp_offset = ip_offset + ip_hdr_size; + + // Need 20 bytes for the fixed TCP header before walking options. + if start + tcp_offset + 20 > end { + return; + } + + // Bytes 12-13 of TCP header: data_offset:4 | reserved:4 | flags:8. + let doff_byte = unsafe { *((start + tcp_offset + 12) as *const u8) }; + let flags = unsafe { *((start + tcp_offset + 13) as *const u8) }; + if flags & TCP_FLAG_SYN == 0 { + return; // Not SYN/SYN-ACK. + } + let doff_words = (doff_byte >> 4) as usize; + if !(5..=15).contains(&doff_words) { + return; + } + let tcp_hdr_len = doff_words * 4; + let opts_len = tcp_hdr_len - 20; + if opts_len == 0 { + bump_stat(StatIdx::MssClampSkipped); + return; + } + if start + tcp_offset + tcp_hdr_len > end { + return; + } + + // Walk options. Cap at 8 — real SYN packets put MSS in the first + // 1-4 options (Linux's tcp_options_write emits MSS very early); 8 + // is comfortable headroom while keeping verifier state-space bounded. 
+ let opts_start_off = tcp_offset + 20; + let mut cursor: usize = 0; + let mut found = false; + + for _ in 0..8 { + if cursor >= opts_len { + break; + } + let p_addr = start + opts_start_off + cursor; + if p_addr + 4 > end { + break; + } + let p = p_addr as *const u8; + let kind = unsafe { *p }; + if kind == 0 { + break; // EOL. + } + if kind == 1 { + cursor += 1; // NOP. + continue; + } + let length = unsafe { *p.add(1) } as usize; + if length < 2 || cursor + length > opts_len { + break; // Malformed. + } + if kind == 2 && length == 4 { + // MSS option: [kind=2, length=4, mss_be:2]. + let mss_be = unsafe { [*p.add(2), *p.add(3)] }; + let mss = u16::from_be_bytes(mss_be); + if mss > clamp { + let new_mss_be = clamp.to_be_bytes(); + unsafe { + let pmut = p as *mut u8; + *pmut.add(2) = new_mss_be[0]; + *pmut.add(3) = new_mss_be[1]; + } + // RFC 1624 incremental TCP checksum update. + let csum_off = tcp_offset + 16; + if start + csum_off + 2 > end { + return; + } + let csum_p = (start + csum_off) as *mut u8; + let old_csum_be = unsafe { [*csum_p, *csum_p.add(1)] }; + let old_csum = u16::from_be_bytes(old_csum_be); + let new_csum = csum_replace_u16(old_csum, mss, clamp); + let new_csum_be = new_csum.to_be_bytes(); + unsafe { + *csum_p = new_csum_be[0]; + *csum_p.add(1) = new_csum_be[1]; + } + bump_stat(StatIdx::MssClampApplied); + } else { + bump_stat(StatIdx::MssClampSkipped); + } + found = true; + break; + } + cursor += length; + } + + if !found { + bump_stat(StatIdx::MssClampSkipped); + } +} + +#[inline(always)] +fn csum_replace_u16(old_csum: u16, old_val: u16, new_val: u16) -> u16 { + let mut sum: u32 = (!old_csum) as u32 + (!old_val) as u32 + new_val as u32; + sum = (sum & 0xffff) + (sum >> 16); + sum = (sum & 0xffff) + (sum >> 16); + !(sum as u16) +} + +#[inline(always)] +fn lookup_mss_clamp_v4(ip: *const Ipv4Hdr, egress_ifindex: u32) -> u16 { + { + let key = Key::new(32, unsafe { (*ip).src_addr }); + if let Some(entry) = MSS_CLAMP_V4.get(&key) { + if 
entry.iface_filter == 0 || entry.iface_filter == egress_ifindex { + return entry.mss; + } + } + } + { + let key = Key::new(32, unsafe { (*ip).dst_addr }); + if let Some(entry) = MSS_CLAMP_V4.get(&key) { + if entry.iface_filter == 0 || entry.iface_filter == egress_ifindex { + return entry.mss; + } + } + } + if let Some(mss) = unsafe { MSS_CLAMP_BY_IFACE.get(&egress_ifindex) } { + if *mss != 0 { + return *mss; + } + } + if let Some(c) = CFG.get(0) { + return c.mss_clamp_global; + } + 0 +} + +#[inline(always)] +fn lookup_mss_clamp_v6(ip: *const Ipv6Hdr, egress_ifindex: u32) -> u16 { + { + let key = Key::new(128, unsafe { (*ip).src_addr }); + if let Some(entry) = MSS_CLAMP_V6.get(&key) { + if entry.iface_filter == 0 || entry.iface_filter == egress_ifindex { + return entry.mss; + } + } + } + { + let key = Key::new(128, unsafe { (*ip).dst_addr }); + if let Some(entry) = MSS_CLAMP_V6.get(&key) { + if entry.iface_filter == 0 || entry.iface_filter == egress_ifindex { + return entry.mss; + } + } + } + if let Some(mss) = unsafe { MSS_CLAMP_BY_IFACE.get(&egress_ifindex) } { + if *mss != 0 { + return *mss; + } + } + if let Some(c) = CFG.get(0) { + return c.mss_clamp_global; + } + 0 +} + +// --- VLAN choreography (relocated from main.rs in v0.2.5) ----------------- + +/// SPEC §4.7's four-case VLAN matrix, keyed on VLAN_NONE-sentinel u16s +/// rather than `Option` (the verifier rejects the Option-argument +/// spill across a function boundary). +#[inline(always)] +fn apply_vlan_egress(ctx: &XdpContext, ingress_vid: u16, egress_vid: u16) -> Result<(), ()> { + let ingress_present = ingress_vid != VLAN_NONE; + let egress_present = egress_vid != VLAN_NONE; + match (ingress_present, egress_present) { + (false, false) => Ok(()), + (true, true) if ingress_vid == egress_vid => Ok(()), + (false, true) => vlan_push(ctx, egress_vid), + (true, false) => vlan_pop(ctx), + (true, true) => vlan_rewrite(ctx, egress_vid), + } +} + +/// Untagged → tagged. 
Grows headroom by 4, shifts the MAC pair left by +/// 4 bytes, writes TPID + TCI into the freed-up slot. Uses +/// `core::ptr::copy` (memmove) not `copy_nonoverlapping` because source +/// and destination overlap — SPEC calls this out as a footgun. +#[inline(always)] +fn vlan_push(ctx: &XdpContext, vid: u16) -> Result<(), ()> { + let rc = unsafe { bpf_xdp_adjust_head(ctx.ctx as *mut _, -4) }; + if rc != 0 { + return Err(()); + } + let start = ctx.data(); + let end = ctx.data_end(); + if start + 18 > end { + return Err(()); + } + unsafe { + let base = start as *mut u8; + core::ptr::copy(base.add(4), base, 6); + core::ptr::copy(base.add(10), base.add(6), 6); + let tpid = TPID_8021Q.to_be_bytes(); + *base.add(12) = tpid[0]; + *base.add(13) = tpid[1]; + let tci = (vid & 0x0fff).to_be_bytes(); + *base.add(14) = tci[0]; + *base.add(15) = tci[1]; + } + Ok(()) +} + +/// Tagged → untagged. Shifts the MAC pair right by 4 over the about-to- +/// be-discarded TPID+TCI slot, then shrinks headroom by 4. +#[inline(always)] +fn vlan_pop(ctx: &XdpContext) -> Result<(), ()> { + let start = ctx.data(); + let end = ctx.data_end(); + if start + 18 > end { + return Err(()); + } + unsafe { + let base = start as *mut u8; + core::ptr::copy(base.add(6), base.add(10), 6); + core::ptr::copy(base, base.add(4), 6); + } + let rc = unsafe { bpf_xdp_adjust_head(ctx.ctx as *mut _, 4) }; + if rc != 0 { + return Err(()); + } + Ok(()) +} + +/// Tagged VID X → tagged VID Y. No headroom change; overwrite TCI in place. 
+#[inline(always)] +fn vlan_rewrite(ctx: &XdpContext, vid: u16) -> Result<(), ()> { + let start = ctx.data(); + let end = ctx.data_end(); + if start + 16 > end { + return Err(()); + } + let tci = (vid & 0x0fff).to_be_bytes(); + unsafe { + let base = start as *mut u8; + *base.add(14) = tci[0]; + *base.add(15) = tci[1]; + } + Ok(()) +} diff --git a/crates/modules/fast-path/bpf/src/main.rs b/crates/modules/fast-path/bpf/src/main.rs index 39fd065..ae856b9 100644 --- a/crates/modules/fast-path/bpf/src/main.rs +++ b/crates/modules/fast-path/bpf/src/main.rs @@ -31,12 +31,13 @@ use network_types::{ }; mod fib; +mod finalize; mod maps; use maps::{ bump_stat, StatIdx, ALLOW_V4, ALLOW_V6, BLOCK_V4, BLOCK_V6, CFG, FP_CFG_FLAG_COMPARE_MODE, - FP_CFG_FLAG_CUSTOM_FIB, FP_CFG_FLAG_HEAD_SHIFT_128, MSS_CLAMP_BY_IFACE, MSS_CLAMP_V4, - MSS_CLAMP_V6, REDIRECT_DEVMAP, VLAN_RESOLVE, + FP_CFG_FLAG_CUSTOM_FIB, FP_CFG_FLAG_HEAD_SHIFT_128, MUTATION_CTX, MUTATION_PROGS, + REDIRECT_DEVMAP, VLAN_RESOLVE, }; const AF_INET: u8 = 2; @@ -50,9 +51,6 @@ const PROTO_TCP: u8 = IpProto::Tcp as u8; const PROTO_UDP: u8 = IpProto::Udp as u8; const PROTO_ICMPV6: u8 = IpProto::Ipv6Icmp as u8; -/// 802.1Q TPID. What we write back on push / rewrite. -const TPID_8021Q: u16 = 0x8100; - /// Sentinel u16 representing "no VLAN" in the VID-passing API below. /// 802.1Q reserves VID 0 for priority-only tagging, so `vid == 0` /// means absent from the fast-path's perspective. Using a single u16 @@ -418,8 +416,14 @@ fn dispatch_fib( /// Success path shared between the kernel-FIB (`dispatch_fib`) and /// custom-FIB (`dispatch_custom_fib`) code paths. Takes a decided /// `(egress_ifindex, smac, dmac)` and performs VLAN resolution, -/// devmap pre-check, TTL decrement, L2 rewrite, VLAN choreography, -/// and redirect. Must not be called without a valid forward decision. 
+/// devmap pre-check, TTL decrement, L2 rewrite, then writes the +/// per-CPU `MUTATION_CTX` and tail-calls into the `finalize` program +/// which handles mss-clamp + VLAN choreography + `bpf_redirect_map`. +/// +/// The split is v0.2.5+: prior versions did mss-clamp + VLAN + +/// redirect inline, but the cumulative stack pushed past UniFi 5.15's +/// stricter accounting. Splitting gives finalize its own 512-byte +/// stack budget. See SPEC §4.x "Two-stage BPF datapath." #[inline(always)] fn forward_success( ctx: &XdpContext, @@ -453,48 +457,52 @@ fn forward_success( return Ok(xdp_action::XDP_PASS); } - // MSS clamping (v0.2.4+, closes SPEC §11.4 gap). Mutate the TCP - // MSS option in SYN/SYN-ACK packets before they're handed to the - // egress NIC — must happen before `apply_vlan_egress` (which can - // shift packet bytes via `bpf_xdp_adjust_head`) but is order- - // independent w.r.t. TTL decrement and L2 rewrite (those edit - // existing bytes in place). No-op for non-TCP, non-SYN packets, - // or when no clamp policy applies. Skipped under `is_dry_run()` - // because dry-run returns XDP_PASS earlier in the flow. - mss_clamp_inline(ctx, ip, is_v4, egress_ifindex); - - // TTL/hop_limit + csum first — IP header's position in memory - // doesn't change with adjust_head, only its offset from `data`. + // TTL/hop_limit + csum — IP header's position in memory doesn't + // change with adjust_head, only its offset from `data`. Safe to do + // before VLAN choreography (which lives in finalize). if is_v4 { decrement_ipv4_ttl(ip as *mut Ipv4Hdr); } else { decrement_ipv6_hop_limit(ip as *mut Ipv6Hdr); } - // L2 rewrite BEFORE push/pop — push moves the current MAC - // positions into new slots, so the values there need to be the - // post-FIB MACs. + // L2 rewrite (in-place; no offset change). Done here so the post- + // FIB MACs are in place before VLAN push potentially shifts them. 
unsafe { (*eth).dst_addr = dmac; (*eth).src_addr = smac; } - // VLAN choreography (SPEC §4.7). On error, XDP_ABORTED + - // err_vlan per the spec. - if apply_vlan_egress(ctx, ingress_vid, egress_vid).is_err() { - bump_stat(StatIdx::ErrVlan); - return Ok(xdp_action::XDP_ABORTED); - } - - match REDIRECT_DEVMAP.redirect(egress_ifindex, 0) { - Ok(_) => { - bump_stat(StatIdx::FwdOk); - Ok(xdp_action::XDP_REDIRECT) - } - Err(_) => { - bump_stat(StatIdx::ErrFibOther); - Ok(xdp_action::XDP_PASS) + // Write decision state to per-CPU MUTATION_CTX, then tail-call + // into MUTATION_PROGS[0] (= finalize). Finalize handles mss-clamp, + // VLAN choreography, and the final bpf_redirect_map. `ip` is a + // packet pointer derived from `ctx.data()`; their difference is a + // verifier-tracked scalar. + let ip_offset = (ip as usize) - ctx.data(); + if let Some(mctx_ptr) = MUTATION_CTX.get_ptr_mut(0) { + unsafe { + (*mctx_ptr).egress_ifindex = egress_ifindex; + (*mctx_ptr).egress_vid = egress_vid; + (*mctx_ptr).ingress_vid = ingress_vid; + (*mctx_ptr).ip_offset = ip_offset as u32; + (*mctx_ptr).is_v4 = u8::from(is_v4); + (*mctx_ptr)._pad = [0; 3]; } + } else { + // Per-CPU array index 0 is always present; this branch should + // be unreachable. Fail-safe XDP_PASS so traffic falls to kernel + // rather than getting blackholed. + bump_stat(StatIdx::ErrMutationCtx); + return Ok(xdp_action::XDP_PASS); } + + // tail_call returns Err on slot-empty / invalid; on success it + // doesn't return at all (control transfers to finalize). The Err + // branch is fail-safe — if userspace's populate_mutation_progs + // somehow skipped slot 0, traffic falls through to kernel rather + // than getting silently dropped. 
+ let _ = unsafe { MUTATION_PROGS.tail_call(ctx, 0) }; + bump_stat(StatIdx::ErrTailCall); + Ok(xdp_action::XDP_PASS) } /// Dispatch on a [`fib::CustomFibResult`] returned by the custom-FIB @@ -564,365 +572,6 @@ fn compare_and_bump( } } -// --- VLAN choreography ---------------------------------------------------- - -/// §4.7's four-case matrix, keyed on VLAN_NONE-sentinel u16s rather -/// than Option (the verifier rejects the Option-argument spill). -/// Returns Err(()) on any packet-manipulation failure. -#[inline(always)] -fn apply_vlan_egress(ctx: &XdpContext, ingress_vid: u16, egress_vid: u16) -> Result<(), ()> { - let ingress_present = ingress_vid != VLAN_NONE; - let egress_present = egress_vid != VLAN_NONE; - match (ingress_present, egress_present) { - (false, false) => Ok(()), - (true, true) if ingress_vid == egress_vid => Ok(()), - (false, true) => vlan_push(ctx, egress_vid), - (true, false) => vlan_pop(ctx), - (true, true) => vlan_rewrite(ctx, egress_vid), - } -} - -/// Untagged → tagged. Grows headroom by 4, shifts the MAC pair left by -/// 4 bytes, writes TPID + TCI into the freed-up slot. SPEC §4.7. -/// -/// Uses `core::ptr::copy` (true memmove) not `copy_nonoverlapping` — -/// the source and destination regions overlap. This is the footgun -/// SPEC calls out: on some VID combinations `copy_nonoverlapping` -/// produces wrong bytes on the wire and the verifier does NOT catch it. -#[inline(always)] -fn vlan_push(ctx: &XdpContext, vid: u16) -> Result<(), ()> { - // Grow headroom by 4 bytes. - let rc = unsafe { bpf_xdp_adjust_head(ctx.ctx as *mut _, -4) }; - if rc != 0 { - return Err(()); - } - - let start = ctx.data(); - let end = ctx.data_end(); - if start + 18 > end { - return Err(()); - } - - // SAFETY: pointers derived from the freshly-re-read data/data_end, - // both bounds-checked for the 18-byte range we touch. - unsafe { - let base = start as *mut u8; - // Move dst_mac left: [4..10] → [0..6]. 6-byte move, overlapping. 
- core::ptr::copy(base.add(4), base, 6); - // Move src_mac left: [10..16] → [6..12]. Overlapping. - core::ptr::copy(base.add(10), base.add(6), 6); - // TPID (0x8100) at [12..14], big-endian. - let tpid = TPID_8021Q.to_be_bytes(); - *base.add(12) = tpid[0]; - *base.add(13) = tpid[1]; - // TCI at [14..16]: PCP=0 | DEI=0 | VID (12 bits), big-endian. - let tci = (vid & 0x0fff).to_be_bytes(); - *base.add(14) = tci[0]; - *base.add(15) = tci[1]; - // [16..18] already holds the original inner ethertype — - // bpf_xdp_adjust_head didn't touch packet bytes, only the - // data pointer. - } - Ok(()) -} - -/// Tagged → untagged. Shifts the MAC pair right by 4 bytes over the -/// (about-to-be-discarded) TPID+TCI slot, then shrinks headroom by 4. -#[inline(always)] -fn vlan_pop(ctx: &XdpContext) -> Result<(), ()> { - let start = ctx.data(); - let end = ctx.data_end(); - if start + 18 > end { - return Err(()); - } - // SAFETY: bounds-checked 18-byte range. - unsafe { - let base = start as *mut u8; - // Move src_mac right first: [6..12] → [10..16]. Overlapping. - core::ptr::copy(base.add(6), base.add(10), 6); - // Move dst_mac right: [0..6] → [4..10]. Overlapping. - core::ptr::copy(base, base.add(4), 6); - } - // Shrink headroom by 4; new data starts 4 bytes later. - let rc = unsafe { bpf_xdp_adjust_head(ctx.ctx as *mut _, 4) }; - if rc != 0 { - return Err(()); - } - Ok(()) -} - -/// Tagged VID X → tagged VID Y (X ≠ Y). No headroom change; overwrite -/// the TCI bytes in place. -#[inline(always)] -fn vlan_rewrite(ctx: &XdpContext, vid: u16) -> Result<(), ()> { - let start = ctx.data(); - let end = ctx.data_end(); - if start + 16 > end { - return Err(()); - } - let tci = (vid & 0x0fff).to_be_bytes(); - unsafe { - let base = start as *mut u8; - *base.add(14) = tci[0]; - *base.add(15) = tci[1]; - } - Ok(()) -} - -// --- MSS clamping (v0.2.4+, SPEC §4.x — closes §11.4 gap) ----------------- - -/// SYN flag in TCP byte 13 (low byte of `doff_flags` over the wire). 
-const TCP_FLAG_SYN: u8 = 0x02; - -/// Walk the TCP-options block of a matched SYN/SYN-ACK and mutate the -/// MSS option in place if a clamp policy applies and the existing MSS -/// is greater than the clamp value. Recomputes the TCP checksum -/// incrementally (RFC 1624). Bumps `MssClampApplied` on rewrite and -/// `MssClampSkipped` on "policy applies but no rewrite needed" — i.e. -/// existing MSS already ≤ clamp, no MSS option present, or malformed -/// options walked past before finding it. -/// -/// Bounds-checked at every read against `ctx.data_end()`. The options -/// loop is fixed-bound at 40 iterations (TCP options max 40 bytes; -/// each iteration consumes at least 1 byte) so the BPF verifier -/// accepts it without unrolling concerns. -/// -/// Marked `#[inline(always)]` deliberately. Two earlier attempts to -/// split this into a subprogram (for stack budget) ran into the BPF -/// kernel verifier rejecting the bpf2bpf calling convention LLVM -/// emits: even when arguments are scalar, LLVM SROA decomposes -/// `&XdpContext` into `(data, data_end)` packet pointers, and the -/// verifier prohibits pointer-shift instructions on packet pointers -/// (the lift LLVM emits to extend 32-bit→64-bit). Inlining is the -/// only verifier-friendly option for code that touches the packet. -/// -/// Stack trim: each LPM key is block-scoped so the compiler can -/// reuse the same stack slot for src and dst keys rather than -/// holding both live; lookup helpers are also `#[inline(always)]` -/// for the same reason; src/dst addresses are read inside their -/// respective LPM blocks rather than at the function top. -#[inline(always)] -fn mss_clamp_inline(ctx: &XdpContext, ip: *mut u8, is_v4: bool, egress_ifindex: u32) { - let start = ctx.data(); - let end = ctx.data_end(); - - // Read protocol byte first — bail early on non-TCP, which is - // the overwhelmingly common case for fast-pathed traffic. 
- let proto = if is_v4 { - unsafe { (*(ip as *const Ipv4Hdr)).proto } - } else { - unsafe { (*(ip as *const Ipv6Hdr)).next_hdr } - }; - if proto != PROTO_TCP { - return; - } - - // Look up clamp value via the precedence chain. Returns 0 if no - // policy applies. Helper is `#[inline(always)]`; its locals share - // this function's frame and are block-scoped for slot reuse. - let clamp = if is_v4 { - lookup_mss_clamp_v4(ip as *const Ipv4Hdr, egress_ifindex) - } else { - lookup_mss_clamp_v6(ip as *const Ipv6Hdr, egress_ifindex) - }; - if clamp == 0 { - return; - } - - // Recover the IP-header offset so we can compute the TCP offset - // (and bounds-check) without holding `ip` as a separate pointer - // variable. ip - start is a scalar (pkt_a - pkt_b) per the - // verifier. - let ip_offset = (ip as usize) - start; - let tcp_offset = ip_offset + if is_v4 { Ipv4Hdr::LEN } else { Ipv6Hdr::LEN }; - - // Need 20 bytes for the fixed TCP header before walking options. - if start + tcp_offset + 20 > end { - return; - } - - // Bytes 12-13 of TCP header: data_offset:4 | reserved:4 | flags:8. - // doff is in 32-bit words; valid range [5, 15] = [20, 60] bytes. - let doff_byte = unsafe { *((start + tcp_offset + 12) as *const u8) }; - let flags = unsafe { *((start + tcp_offset + 13) as *const u8) }; - if flags & TCP_FLAG_SYN == 0 { - return; // Not SYN/SYN-ACK; clamp doesn't apply. - } - let doff_words = (doff_byte >> 4) as usize; - if !(5..=15).contains(&doff_words) { - return; - } - let tcp_hdr_len = doff_words * 4; - let opts_len = tcp_hdr_len - 20; - if opts_len == 0 { - // SYN with no options — operator policy says "clamp" but - // there's no MSS field to mutate. Count as skipped. - bump_stat(StatIdx::MssClampSkipped); - return; - } - if start + tcp_offset + tcp_hdr_len > end { - return; - } - - // Walk options. 
Cap at 8 iterations: real SYN packets put MSS in - // the first 1-4 options, and 8 is plenty of headroom while - // keeping the BPF verifier's state-exploration bounded. A 40- - // iteration walk hit the verifier's 1M-instruction processing - // limit due to combinatorial state explosion across the branches. - // - // Use sequential `if` checks rather than `match` for the same - // reason — fewer state-space splits per iteration. - let opts_start_off = tcp_offset + 20; - let mut cursor: usize = 0; - let mut found = false; - - for _ in 0..8 { - if cursor >= opts_len { - break; - } - let p_addr = start + opts_start_off + cursor; - // Need at least 4 bytes for a worst-case MSS option; if - // there's less than that left, no MSS is possible. Also - // bounds-checks the kind/length reads below. - if p_addr + 4 > end { - break; - } - let p = p_addr as *const u8; - let kind = unsafe { *p }; - if kind == 0 { - break; // EOL — no more options. - } - if kind == 1 { - cursor += 1; // NOP, single byte. - continue; - } - // Length-prefixed option (kind != 0, != 1). Length includes - // the kind+length bytes themselves; valid range is 2..=opts_len. - let length = unsafe { *p.add(1) } as usize; - if length < 2 || cursor + length > opts_len { - break; // Malformed. - } - if kind == 2 && length == 4 { - // MSS option: [kind=2, length=4, mss_be:2]. - let mss_be = unsafe { [*p.add(2), *p.add(3)] }; - let mss = u16::from_be_bytes(mss_be); - if mss > clamp { - let new_mss_be = clamp.to_be_bytes(); - unsafe { - let pmut = p as *mut u8; - *pmut.add(2) = new_mss_be[0]; - *pmut.add(3) = new_mss_be[1]; - } - // TCP csum is at offset 16 of the TCP header; do an - // RFC 1624 incremental update. 
- let csum_off = tcp_offset + 16; - if start + csum_off + 2 > end { - return; - } - let csum_p = (start + csum_off) as *mut u8; - let old_csum_be = unsafe { [*csum_p, *csum_p.add(1)] }; - let old_csum = u16::from_be_bytes(old_csum_be); - let new_csum = csum_replace_u16(old_csum, mss, clamp); - let new_csum_be = new_csum.to_be_bytes(); - unsafe { - *csum_p = new_csum_be[0]; - *csum_p.add(1) = new_csum_be[1]; - } - bump_stat(StatIdx::MssClampApplied); - } else { - bump_stat(StatIdx::MssClampSkipped); - } - found = true; - break; - } - cursor += length; - } - - if !found { - // Hit EOL or walked past the budget without an MSS option. - bump_stat(StatIdx::MssClampSkipped); - } -} - -/// Apply RFC 1624 incremental checksum update for a single 16-bit -/// field change: `HC' = ~(~HC + ~m + m')`. Two-iteration end-around -/// carry fold (max 2 needed for adding three 16-bit values into a -/// u32). Verifier-friendly — no loops. -#[inline(always)] -fn csum_replace_u16(old_csum: u16, old_val: u16, new_val: u16) -> u16 { - let mut sum: u32 = (!old_csum) as u32 + (!old_val) as u32 + new_val as u32; - sum = (sum & 0xffff) + (sum >> 16); - sum = (sum & 0xffff) + (sum >> 16); - !(sum as u16) -} - -/// Resolve the mss-clamp value for an IPv4 packet, in precedence -/// order: src-prefix → dst-prefix → per-egress → global. Returns 0 if -/// no policy applies. The LPM lookups respect each entry's -/// `iface_filter` (0 = wildcard). Block-scope each Key + addr so LLVM -/// can reuse the same stack slot rather than carrying both keys live -/// — matters for the cumulative BPF 512-byte stack budget. Reads -/// addresses through the IP-header pointer rather than taking them -/// by value so the caller doesn't pre-materialize them on its frame. 
-#[inline(always)] -fn lookup_mss_clamp_v4(ip: *const Ipv4Hdr, egress_ifindex: u32) -> u16 { - { - let key = Key::new(32, unsafe { (*ip).src_addr }); - if let Some(entry) = MSS_CLAMP_V4.get(&key) { - if entry.iface_filter == 0 || entry.iface_filter == egress_ifindex { - return entry.mss; - } - } - } - { - let key = Key::new(32, unsafe { (*ip).dst_addr }); - if let Some(entry) = MSS_CLAMP_V4.get(&key) { - if entry.iface_filter == 0 || entry.iface_filter == egress_ifindex { - return entry.mss; - } - } - } - if let Some(mss) = unsafe { MSS_CLAMP_BY_IFACE.get(&egress_ifindex) } { - if *mss != 0 { - return *mss; - } - } - if let Some(c) = CFG.get(0) { - return c.mss_clamp_global; - } - 0 -} - -/// IPv6 mirror of [`lookup_mss_clamp_v4`] — same precedence, /128 keys. -#[inline(always)] -fn lookup_mss_clamp_v6(ip: *const Ipv6Hdr, egress_ifindex: u32) -> u16 { - { - let key = Key::new(128, unsafe { (*ip).src_addr }); - if let Some(entry) = MSS_CLAMP_V6.get(&key) { - if entry.iface_filter == 0 || entry.iface_filter == egress_ifindex { - return entry.mss; - } - } - } - { - let key = Key::new(128, unsafe { (*ip).dst_addr }); - if let Some(entry) = MSS_CLAMP_V6.get(&key) { - if entry.iface_filter == 0 || entry.iface_filter == egress_ifindex { - return entry.mss; - } - } - } - if let Some(mss) = unsafe { MSS_CLAMP_BY_IFACE.get(&egress_ifindex) } { - if *mss != 0 { - return *mss; - } - } - if let Some(c) = CFG.get(0) { - return c.mss_clamp_global; - } - 0 -} - // --- TTL / csum / helpers ------------------------------------------------- /// Decrement IPv4 TTL and patch the header checksum using RFC 1624 diff --git a/crates/modules/fast-path/bpf/src/maps.rs b/crates/modules/fast-path/bpf/src/maps.rs index 4e9c6fe..61c3776 100644 --- a/crates/modules/fast-path/bpf/src/maps.rs +++ b/crates/modules/fast-path/bpf/src/maps.rs @@ -8,7 +8,7 @@ use aya_ebpf::{ macros::map, - maps::{Array, DevMapHash, HashMap, LpmTrie, PerCpuArray, RingBuf}, + maps::{Array, DevMapHash, HashMap, LpmTrie, 
PerCpuArray, ProgramArray, RingBuf}, }; /// Runtime flags poked by userspace via the `cfg` map. `version` is a @@ -134,11 +134,21 @@ pub enum StatIdx { /// to gauge how often clamps are firing vs being skipped on existing /// well-behaved traffic. MssClampSkipped = 34, + /// v0.2.5: fast-path's `bpf_tail_call` into `MUTATION_PROGS[0]` + /// returned an error (slot empty / invalid). Should be 0 in steady + /// state; non-zero means `populate_mutation_progs` failed at attach + /// time. fast_path falls back to XDP_PASS so traffic still flows + /// (kernel slow path) — the chain is fail-safe. + ErrTailCall = 35, + /// v0.2.5: finalize couldn't read the per-CPU `MUTATION_CTX` + /// scratch slot. Shouldn't happen — fast_path always writes before + /// tail_call. Diagnostic; finalize XDP_PASSes on this error. + ErrMutationCtx = 36, } /// Total counter count. Used as `stats` map `max_entries`. New counters /// bump this; dashboards keying on indices keep working. -pub const STATS_COUNT: u32 = 35; +pub const STATS_COUNT: u32 = 37; /// Flag bits for `FpCfg.flags`. Bits 0-1 are the IPv4/IPv6 enable /// mask (historical, load-bearing for dashboards). Bit 2 is the @@ -252,6 +262,45 @@ pub struct MssClampValue { pub iface_filter: u32, } +// --- Two-stage datapath: per-CPU mutation context (v0.2.5+) ------------ + +/// Per-CPU scratch carrying decision state from `fast_path` (XDP, attached +/// to ifaces) to `finalize` (XDP, tail-called by fast_path). +/// +/// fast_path writes this immediately before `MUTATION_PROGS.tail_call(0)`; +/// finalize reads it as its first action. The packet itself is preserved +/// across the tail-call by the kernel (`xdp_buff` survives, mutations +/// stick), but locally-computed scalars (FIB-resolved egress, ingress/egress +/// VID, IP-header offset, family discriminator) need this side channel. +/// +/// Per-CPU because the NAPI cycle is single-CPU; the read in finalize is +/// guaranteed to see the write in fast_path with no synchronization. 
+/// +/// 16 bytes, naturally aligned. Size + alignment is asserted in a +/// userspace test (see `crates/modules/fast-path/src/linux_impl.rs`'s +/// `MutationCtx` mirror). +#[repr(C)] +#[derive(Copy, Clone)] +pub struct MutationCtx { + /// FIB-resolved egress (pre-VLAN-resolve). Used by finalize for + /// VLAN_RESOLVE lookup, MSS_CLAMP_BY_IFACE lookup, and the final + /// `bpf_redirect_map` call. + pub egress_ifindex: u32, + /// Egress VLAN ID (from VLAN_RESOLVE in fast_path; 0 = untagged). + pub egress_vid: u16, + /// Ingress VLAN ID (from packet parse; 0 = untagged). Combined with + /// `egress_vid` drives the four-case VLAN choreography in finalize. + pub ingress_vid: u16, + /// Offset (bytes) from `ctx.data()` to the IP header. fast_path + /// already validated bounds; finalize uses this for mss-clamp's + /// TCP-header bounds check. + pub ip_offset: u32, + /// 1 = IPv4 packet, 0 = IPv6. Determines which IP-header struct + /// finalize casts to and which MSS_CLAMP map to consult. + pub is_v4: u8, + pub _pad: [u8; 3], +} + // --- Custom FIB value layouts (Option F) ------------------------------- /// Max nexthops in a single ECMP group. The XDP program walks `nh_idx` @@ -452,6 +501,28 @@ pub static MSS_CLAMP_V6: LpmTrie<[u8; 16], MssClampValue> = pub static MSS_CLAMP_BY_IFACE: HashMap = HashMap::with_max_entries(MSS_CLAMP_IFACE_MAX_ENTRIES, 0); +// --- Two-stage datapath maps (v0.2.5+) --------------------------------- + +/// Per-CPU scratch carrying decision state from `fast_path` to `finalize` +/// across `bpf_tail_call`. Single-element array; fast_path writes index 0 +/// before `tail_call`, finalize reads index 0 as its first action. Per- +/// CPU avoids contention; NAPI cycle is single-CPU so the read sees the +/// most recent write. +#[map] +pub static MUTATION_CTX: PerCpuArray = PerCpuArray::with_max_entries(1, 0); + +/// Tail-call jump table (v0.2.5+). fast_path tail_calls into slot 0 after +/// classification + L2/TTL mutations. 
Slot 0 holds `finalize` today. +/// Sized for 8 future stages (chained finalizers, alternate clamp +/// strategies, future packet transforms) — slot count is BPF-load-time- +/// fixed, so headroom here is cheap. +/// +/// Tail-call into an empty slot returns an error to the caller; fast_path +/// handles this by bumping `ErrTailCall` and returning XDP_PASS so traffic +/// fails open to kernel slow-path rather than getting blackholed. +#[map] +pub static MUTATION_PROGS: ProgramArray = ProgramArray::with_max_entries(8, 0); + // --- Custom-FIB maps (Option F, Phase 1) ------------------------------- // // These maps are declared and sized in Phase 1 but neither read nor diff --git a/crates/modules/fast-path/src/lib.rs b/crates/modules/fast-path/src/lib.rs index 6d88656..10bc4ea 100644 --- a/crates/modules/fast-path/src/lib.rs +++ b/crates/modules/fast-path/src/lib.rs @@ -29,7 +29,8 @@ pub mod reconcile; #[cfg(target_os = "linux")] pub use linux_impl::{ - fib_status_from_pin, stats_from_pin, trial_attach_native, FibStatusSnapshot, TrialResult, + fib_status_from_pin, stats_from_pin, tail_call_chain_from_pin, trial_attach_native, + FibStatusSnapshot, TrialResult, }; pub const MODULE_NAME: &str = "fast-path"; diff --git a/crates/modules/fast-path/src/linux_impl.rs b/crates/modules/fast-path/src/linux_impl.rs index 1e027e4..d44e312 100644 --- a/crates/modules/fast-path/src/linux_impl.rs +++ b/crates/modules/fast-path/src/linux_impl.rs @@ -657,6 +657,29 @@ fn populate_mss_clamp(ebpf: &mut Ebpf, mcfg: &ModuleConfig<'_>) -> ModuleResult< } pub fn attach(state: &mut ActiveState, cfg: &ModuleConfig<'_>) -> ModuleResult> { + // v0.2.5: load `finalize` first so its FD is available for the + // MUTATION_PROGS[0] population below. Order matters: fast_path's + // tail_call into MUTATION_PROGS[0] must succeed on every packet + // from the moment fast_path is attached, so finalize has to be + // loaded + populated *before* the per-iface attach loop below. 
+ { + let finalize_prog: &mut Xdp = state + .ebpf + .program_mut(pin::FINALIZE_PROGRAM_NAME) + .ok_or_else(|| ModuleError::other(MODULE_NAME, "finalize program missing from ELF"))? + .try_into() + .map_err(|e| { + ModuleError::other(MODULE_NAME, format!("finalize program not XDP: {e}")) + })?; + finalize_prog.load().map_err(|e| { + ModuleError::other( + MODULE_NAME, + format!("Xdp::load(finalize) failed (verifier rejection?): {e}"), + ) + })?; + } + populate_mutation_progs(&mut state.ebpf)?; + let prog: &mut Xdp = state .ebpf .program_mut("fast_path") @@ -1170,18 +1193,26 @@ fn read_iface_driver(iface: &str) -> Option { } fn pin_program_and_maps(state: &mut ActiveState) -> ModuleResult<()> { - let prog_path = pin::program_path(&state.bpffs_root); - { + // v0.2.5: pin both `fast_path` (the iface-attached XDP) and + // `finalize` (the tail-called second stage). Both pins survive + // SIGTERM per SPEC §8.5; on restart, `pin::has_existing_pins` + // sees both and refuses to start until operator runs `detach --all`. + for prog_name in [pin::PROGRAM_NAME, pin::FINALIZE_PROGRAM_NAME] { + let prog_path = pin::program_path_for(&state.bpffs_root, prog_name); let prog: &mut Xdp = state .ebpf - .program_mut(pin::PROGRAM_NAME) - .ok_or_else(|| ModuleError::other(MODULE_NAME, "fast_path program missing for pin"))? + .program_mut(prog_name) + .ok_or_else(|| { + ModuleError::other(MODULE_NAME, format!("{prog_name} program missing for pin")) + })? 
.try_into() - .map_err(|e| ModuleError::other(MODULE_NAME, format!("pin: program not XDP: {e}")))?; + .map_err(|e| { + ModuleError::other(MODULE_NAME, format!("pin: {prog_name} not XDP: {e}")) + })?; prog.pin(&prog_path).map_err(|e| { ModuleError::other( MODULE_NAME, - format!("pin program at {}: {e}", prog_path.display()), + format!("pin {prog_name} at {}: {e}", prog_path.display()), ) })?; } @@ -1201,11 +1232,50 @@ fn pin_program_and_maps(state: &mut ActiveState) -> ModuleResult<()> { info!( pin_root = %pin::module_root(&state.bpffs_root).display(), - "program + maps pinned" + "fast_path + finalize programs + maps pinned" ); Ok(()) } +/// Populate `MUTATION_PROGS[0]` with `finalize`'s FD so fast_path's +/// `bpf_tail_call(MUTATION_PROGS, 0)` resolves to it. Must run after +/// `finalize.load()` (FD valid) and before `fast_path` attaches to any +/// iface (otherwise an early packet would tail-call into an empty slot, +/// trip ErrTailCall, and slow-path through the kernel). +/// +/// v0.2.5+. Single program in slot 0 for now; future stages either +/// replace slot 0 with a chain head or add to subsequent slots. +fn populate_mutation_progs(ebpf: &mut Ebpf) -> ModuleResult<()> { + use aya::maps::ProgramArray; + use aya::programs::ProgramFd; + + // Borrow scope: the ProgramFd has to outlive the ProgramArray::set + // call, but it borrows from `ebpf`. Open the ProgramFd first, then + // reborrow ebpf for the map. + let finalize_fd: ProgramFd = { + let prog: &Xdp = ebpf + .program(pin::FINALIZE_PROGRAM_NAME) + .ok_or_else(|| ModuleError::other(MODULE_NAME, "finalize program missing post-load"))? + .try_into() + .map_err(|e| ModuleError::other(MODULE_NAME, format!("finalize not XDP: {e}")))?; + prog.fd() + .map_err(|e| ModuleError::other(MODULE_NAME, format!("finalize fd: {e}")))? + .try_clone() + .map_err(|e| ModuleError::other(MODULE_NAME, format!("finalize fd clone: {e}")))? 
+ }; + + let map = ebpf + .map_mut("MUTATION_PROGS") + .ok_or_else(|| ModuleError::other(MODULE_NAME, "MUTATION_PROGS map missing"))?; + let mut prog_array: ProgramArray<_> = ProgramArray::try_from(map) + .map_err(|e| ModuleError::other(MODULE_NAME, format!("MUTATION_PROGS try_from: {e}")))?; + prog_array.set(0, &finalize_fd, 0).map_err(|e| { + ModuleError::other(MODULE_NAME, format!("MUTATION_PROGS.set(0, finalize): {e}")) + })?; + info!("MUTATION_PROGS[0] populated with finalize program FD"); + Ok(()) +} + /// §2.3: per-interface trial-attach. `Native` and `Generic` are explicit /// (no fallback); `Auto` tries native first, falls back to generic on /// any error. The spec calls out that `bpftool feature probe` is @@ -1880,13 +1950,12 @@ pub fn stats_from_pin(bpffs_root: &Path) -> ModuleResult> { fn read_stats>( stats: &aya::maps::PerCpuArray, ) -> ModuleResult> { - // `STATS_COUNT` in bpf/src/maps.rs. v0.2.0 = 32 (20 core + 12 - // custom-FIB). v0.2.1 = 33 (added `bogon_dropped` for issue #33). - // Previous versions hardcoded 19 — an off-by-one that hid the - // `err_head_shift` counter from status readback. Keep this in - // lockstep with the BPF side or the last counters show zero - // unfairly. - const STATS_LEN: usize = 33; + // `STATS_COUNT` in bpf/src/maps.rs. Keep in lockstep with the BPF + // side or the last counters show zero unfairly. Prior versions + // hardcoded an off-by-one (19 hid `err_head_shift`; 33 hid + // `mss_clamp_*`); v0.2.5 = 37 (32 + bogon + 2 mss-clamp + 2 + // tail-call diagnostics). + const STATS_LEN: usize = 37; let mut out = vec![0u64; STATS_LEN]; for (idx, slot) in out.iter_mut().enumerate() { let values = stats @@ -1897,6 +1966,43 @@ fn read_stats>( Ok(out) } +/// Read `MUTATION_PROGS` from its bpffs pin and return whether slot 0 +/// is populated. Status command uses this to confirm the v0.2.5+ +/// tail-call chain (`fast_path` → `finalize`) is wired correctly. 
+/// An empty slot means an attach-time bug in `populate_mutation_progs`; +/// fast_path's `tail_call` will fail and bump `ErrTailCall` on every +/// fast-pathed packet. +/// +/// aya 0.13's ProgramArray exposes `indices()` (which keys are set) +/// but not a getter that returns the populated `ProgramFd`/prog_id — +/// the BPF_MAP_TYPE_PROG_ARRAY value is a kernel RawFd that becomes +/// invalid outside the loader's process. We just report populated/ +/// empty here; operators can confirm prog_id via +/// `bpftool prog show name finalize`. +pub fn tail_call_chain_from_pin(bpffs_root: &Path) -> ModuleResult { + use aya::maps::{Map, MapData, ProgramArray}; + + let pin_path = pin::map_path(bpffs_root, "MUTATION_PROGS"); + let map_data = MapData::from_pin(&pin_path).map_err(|e| { + ModuleError::other( + MODULE_NAME, + format!("open MUTATION_PROGS pin at {}: {e}", pin_path.display()), + ) + })?; + let map = Map::ProgramArray(map_data); + let prog_array: ProgramArray<_> = ProgramArray::try_from(map).map_err(|e| { + ModuleError::other(MODULE_NAME, format!("MUTATION_PROGS try_from pin: {e}")) + })?; + for idx in prog_array.indices() { + let key = idx + .map_err(|e| ModuleError::other(MODULE_NAME, format!("MUTATION_PROGS indices: {e}")))?; + if key == 0 { + return Ok(true); + } + } + Ok(false) +} + /// Accessor consumed by the bpffs-pin code in PR #6. For now, /// exposed so the CLI `status` can report the pin root without /// the module needing to expose `ActiveState` directly. diff --git a/crates/modules/fast-path/src/pin.rs b/crates/modules/fast-path/src/pin.rs index b24c1b7..b3cc01a 100644 --- a/crates/modules/fast-path/src/pin.rs +++ b/crates/modules/fast-path/src/pin.rs @@ -22,7 +22,7 @@ use std::path::{Path, PathBuf}; use crate::MODULE_NAME; /// Every §4.5 map that gets pinned. Order is not significant. 
-pub const MAP_NAMES: [&str; 14] = [ +pub const MAP_NAMES: [&str; 19] = [ "ALLOW_V4", "ALLOW_V6", "CFG", @@ -44,11 +44,27 @@ pub const MAP_NAMES: [&str; 14] = [ // pinned for uniform detach. "BLOCK_V4", "BLOCK_V6", + // --- v0.2.4: mss-clamp policy maps --- + "MSS_CLAMP_V4", + "MSS_CLAMP_V6", + "MSS_CLAMP_BY_IFACE", + // --- v0.2.5: two-stage datapath --- + "MUTATION_CTX", + "MUTATION_PROGS", ]; -/// The XDP program's pinned basename. +/// The fast-path XDP program's pinned basename (attached per-iface). pub const PROGRAM_NAME: &str = "fast_path"; +/// The finalize XDP program's pinned basename (tail-called by fast_path +/// via `MUTATION_PROGS[0]`; not directly attached). v0.2.5+. +pub const FINALIZE_PROGRAM_NAME: &str = "finalize"; + +/// All pinned program basenames in this module. Used by detach + +/// `has_existing_pins` to walk every pinned program. Append-only; new +/// programs go at the end. +pub const PROGRAM_NAMES: [&str; 2] = [PROGRAM_NAME, FINALIZE_PROGRAM_NAME]; + pub fn module_root(bpffs_root: &Path) -> PathBuf { bpffs_root.join(MODULE_NAME) } @@ -69,6 +85,13 @@ pub fn program_path(bpffs_root: &Path) -> PathBuf { progs_dir(bpffs_root).join(PROGRAM_NAME) } +/// Path for an arbitrary pinned program by basename. Used by the v0.2.5+ +/// pin lifecycle that walks `PROGRAM_NAMES` to pin both `fast_path` and +/// `finalize`. 
+pub fn program_path_for(bpffs_root: &Path, name: &str) -> PathBuf { + progs_dir(bpffs_root).join(name) +} + pub fn map_path(bpffs_root: &Path, name: &str) -> PathBuf { maps_dir(bpffs_root).join(name) } diff --git a/crates/modules/fast-path/tests/common/mod.rs b/crates/modules/fast-path/tests/common/mod.rs index f60aded..f30ac9c 100644 --- a/crates/modules/fast-path/tests/common/mod.rs +++ b/crates/modules/fast-path/tests/common/mod.rs @@ -23,7 +23,7 @@ use std::os::fd::{AsFd, AsRawFd}; use aya::{ - maps::{lpm_trie::Key as LpmKey, Array, LpmTrie, PerCpuArray}, + maps::{lpm_trie::Key as LpmKey, Array, LpmTrie, PerCpuArray, ProgramArray}, programs::{ProgramFd, Xdp}, Ebpf, Pod, }; @@ -116,18 +116,61 @@ pub struct Harness { } impl Harness { - /// Load + verify the fast-path program. Panics if BPF isn't built - /// or the kernel rejects it. + /// Load + verify both BPF programs (fast_path + finalize, v0.2.5+), + /// and populate `MUTATION_PROGS[0]` so fast_path's `bpf_tail_call` + /// jumps into finalize. Panics if BPF isn't built or the verifier + /// rejects either program. + /// + /// `bpf_prog_test_run` follows tail-calls — the kernel re-enters + /// its BPF dispatcher for the target program, so tests that issue + /// `harness.run(&packet)` see the verdict + mutations from the full + /// chain (fast_path → finalize) when the packet is a successful + /// forward. pub fn new() -> Self { let bytes = aligned_bpf_copy(); let mut bpf = Ebpf::load(&bytes).expect("aya::Ebpf::load"); - let prog: &mut Xdp = bpf - .program_mut("fast_path") - .expect("fast_path program present") - .try_into() - .expect("program is XDP-typed"); - prog.load().expect("verifier accepts program"); + // Load finalize first so its FD is available for the + // MUTATION_PROGS[0] population below. 
+ { + let prog: &mut Xdp = bpf + .program_mut("finalize") + .expect("finalize program present") + .try_into() + .expect("finalize is XDP-typed"); + prog.load().expect("verifier accepts finalize program"); + } + + // Then fast_path. + { + let prog: &mut Xdp = bpf + .program_mut("fast_path") + .expect("fast_path program present") + .try_into() + .expect("program is XDP-typed"); + prog.load().expect("verifier accepts fast_path program"); + } + + // Populate the tail-call jump table. + { + let finalize_fd: ProgramFd = { + let prog: &Xdp = bpf + .program("finalize") + .expect("finalize program present") + .try_into() + .expect("finalize is XDP-typed"); + prog.fd() + .expect("finalize loaded") + .try_clone() + .expect("finalize fd dup") + }; + let map = bpf.map_mut("MUTATION_PROGS").expect("MUTATION_PROGS map"); + let mut prog_array: ProgramArray<_> = + ProgramArray::try_from(map).expect("ProgramArray try_from"); + prog_array + .set(0, &finalize_fd, 0) + .expect("MUTATION_PROGS.set(0, finalize)"); + } // Set a default cfg with dry_run=off and both families enabled. let mut harness = Self { bpf }; diff --git a/docs/runbooks/tail-call-architecture.md b/docs/runbooks/tail-call-architecture.md new file mode 100644 index 0000000..f049f65 --- /dev/null +++ b/docs/runbooks/tail-call-architecture.md @@ -0,0 +1,149 @@ +# Two-stage BPF datapath (v0.2.5+) + +PacketFrame's fast-path runs as **two BPF programs** chained by `bpf_tail_call`. This page exists for operators debugging the chain and contributors planning further BPF work. + +## Why two programs + +The single-program datapath (v0.2.4 and earlier) accumulated mutation, VLAN choreography, and redirect logic in one XDP program. On vanilla 5.15 + 6.6 kernels (CI's qemu test matrix) it loaded fine. On UniFi's `5.15.72-ui-cn9670` (real production hardware), the kernel verifier rejected at: + +``` +combined stack size of 3 calls is 544. 
Too large +stack depth 0+480+0+0 +``` + +UniFi's BPF patches plus the aarch64 JIT account stack ~120 bytes higher than vanilla 5.15 on x86_64 — same bytecode, different verifier accounting. + +Tail-calling into a second program gives that program its own fresh 512-byte stack. Beyond fixing the immediate budget issue, it establishes the pattern for future fast-path stages without re-bisecting stack bytes every time. + +This is **not** the multi-module dispatcher (SPEC §3.4 / §5.0). The dispatcher is for chaining independent modules at the same hook (ddos in front of fast-path, sampler behind it). Tail-call is for splitting one logical pipeline. Both are real and orthogonal; v0.2.5 ships only the latter. + +## Chain topology + +``` + packet ingress + │ + ▼ + ┌──────────────────────────────────────────────────┐ + │ fast_path (XDP, attached to eth0..ethN) │ Frame A + │ classification (allow-prefix, block-prefix) │ fits 512B + │ FIB lookup (kernel-fib | custom-fib | compare)│ + │ devmap pre-check │ + │ TTL decrement │ + │ L2 rewrite (smac/dmac in place) │ + │ write per-CPU MUTATION_CTX │ + │ bpf_tail_call(MUTATION_PROGS, 0) ──────────┐ │ + └────────────────────────────────────────────────│──┘ + │ + ▼ + ┌──────────────────────────────────────────────────┐ + │ finalize (XDP, tail-called by fast_path) │ Frame B + │ read MUTATION_CTX │ fresh 512B + │ mss-clamp lookup + (optional) MSS rewrite │ + │ VLAN choreography (push / pop / rewrite) │ + │ bpf_redirect_map(egress_ifindex) │ + └──────────────────────────────────────────────────┘ + │ + ▼ + egress NIC TX +``` + +The packet itself is preserved across the tail-call — `bpf_tail_call` doesn't touch `xdp_buff`, so any in-place mutations from fast_path (TTL, L2, etc.) carry over. What does NOT carry are the program's local variables, which is why we need a side channel. + +## MUTATION_CTX wire format + +`MUTATION_CTX` is a `PerCpuArray` with a single element. 
`fast_path` writes index 0 immediately before its `bpf_tail_call`; `finalize` reads index 0 as its first action. + +```rust +#[repr(C)] +pub struct MutationCtx { + egress_ifindex: u32, // FIB-resolved egress (pre-VLAN-resolve) + egress_vid: u16, // VLAN_RESOLVE result; 0 = untagged + ingress_vid: u16, // From packet parse; 0 = untagged + ip_offset: u32, // Bytes from ctx.data() to IP header + is_v4: u8, // 1 = IPv4, 0 = IPv6 + _pad: [u8; 3], +} +``` + +16 bytes, naturally aligned. Per-CPU because the NAPI cycle is single-CPU; the read in finalize sees the most recent write in fast_path with no synchronization. + +## MUTATION_PROGS jump table + +`MUTATION_PROGS` is a `ProgramArray` sized for 8 slots. Slot 0 holds `finalize`'s file descriptor. Slots 1–7 are reserved for future stages (see "Adding new stages" below). + +Userspace populates slot 0 at attach time, in `crates/modules/fast-path/src/linux_impl.rs::populate_mutation_progs`. Order is: load `finalize` → populate slot 0 → load + attach `fast_path` to ifaces. If the order is wrong, fast_path's first packet hits an empty slot, `bpf_tail_call` returns an error, and fast_path falls through to `XDP_PASS` (kernel slow-path) while bumping `ErrTailCall`. + +## Diagnostic commands + +```sh +# Confirm both programs are loaded. +sudo bpftool prog show name fast_path +sudo bpftool prog show name finalize + +# Confirm MUTATION_PROGS[0] points at finalize. +sudo bpftool map dump name MUTATION_PROGS +# Expected: key 0x00000000 value + +# packetframe status reports the same: +sudo packetframe status +# tail-call chain (from /sys/fs/bpf/packetframe): +# MUTATION_PROGS[0]: populated (finalize) — confirm prog_id via ... + +# Watch the diagnostic counters: +sudo packetframe status | grep -E 'err_tail_call|err_mutation_ctx' +# Both should be 0 in steady state. + +# Inspect MUTATION_CTX (per-CPU; one entry per CPU): +sudo bpftool map dump name MUTATION_CTX +# Decoded fields are the most recent decision from each CPU's fast-path. 
+# Useful for confirming the chain is firing on real traffic. +``` + +## What `ErrTailCall` and `ErrMutationCtx` mean + +Two new diagnostic counters at indices 35 and 36: + +- `err_tail_call`: fast_path called `MUTATION_PROGS.tail_call(ctx, 0)` and got an error back. Almost always means slot 0 is empty (attach-order bug). fast_path falls through to `XDP_PASS` so traffic still flows via kernel slow-path. +- `err_mutation_ctx`: finalize couldn't read `MUTATION_CTX[0]`. Per-CPU array index 0 is always present, so this should be 0; non-zero indicates a kernel/aya bug worth filing. + +Both are append-only per CLAUDE.md guardrail — operator dashboards keying on counter index keep working. + +## Pin lifecycle + +bpffs layout under `/sys/fs/bpf/packetframe/fast-path/`: + +``` +progs/ +├── fast_path ← attached to ifaces; pin survives SIGTERM +└── finalize ← tail-called; pin survives SIGTERM +maps/ +├── (existing maps: ALLOW_V*, BLOCK_V*, CFG, STATS, ...) +├── MSS_CLAMP_V4 / MSS_CLAMP_V6 / MSS_CLAMP_BY_IFACE (v0.2.4+) +├── MUTATION_CTX (v0.2.5+) +└── MUTATION_PROGS (v0.2.5+) +links/ +└── eth0, eth1, ... ← per-iface XDP attachments (fast_path only) +``` + +`packetframe detach --all` walks both program pins and every map pin. Existing pin lifecycle and SIGTERM-without-detach semantics from SPEC §8.5 apply unchanged: both program pins survive process exit; the bpffs inodes hold kernel references; on restart, `pin::has_existing_pins()` sees them and refuses to start until operator runs `detach --all`. + +## Adding new stages + +The `MUTATION_PROGS` array has room for 7 future stages (slots 1–7). Two patterns: + +**Replace slot 0** if the new stage subsumes finalize's responsibilities. `populate_mutation_progs` decides which program goes in slot 0 based on config. Example: a new `finalize_with_nat` program that does NAT + mss-clamp + VLAN + redirect. + +**Chain via subsequent slots** if the new stage runs *between* finalize-equivalent stages. 
finalize's last action becomes `tail_call(MUTATION_PROGS, 1)` instead of `bpf_redirect_map`; the slot-1 program does redirect. This adds one more 512-byte stack budget. + +In both patterns, all stages share the same `MUTATION_CTX` and `STATS` maps (one ELF, automatic map sharing in aya). New stages can introduce their own scratch maps as needed. + +## What this isn't + +- **Multi-module composition.** ddos / sampler / randomizer (SPEC §5.x) need the libxdp dispatcher, not tail-calls. The dispatcher chains *independent* modules at the same hook based on XDP verdicts; tail-call is one-way control transfer between cooperating stages. +- **A general "anything-goes" tail-call framework.** Tail calls have a depth limit (kernel cap is 33 chained calls; we never approach that) and one-way control flow. They're a tool for stack-budget relief, not a programmability layer. + +## See also + +- [docs/runbooks/mss-clamp.md](mss-clamp.md) — operator guide for the mss-clamp directive (which now lives inside `finalize`) +- [docs/runbooks/reconfigure.md](reconfigure.md) — SIGHUP / `packetframe reconfigure` semantics; both maps update through the same reconcile path regardless of which program reads them +- SPEC.md §3.2 (priority taxonomy), §3.4 (multi-program composition), §4.x (BPF map layouts), §11.x (kernel compatibility notes)