diff --git a/Cargo.lock b/Cargo.lock index 3b64876..17dcd25 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -680,7 +680,7 @@ checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" [[package]] name = "packetframe-cli" -version = "0.2.4" +version = "0.2.5" dependencies = [ "anyhow", "clap", @@ -697,7 +697,7 @@ dependencies = [ [[package]] name = "packetframe-common" -version = "0.2.4" +version = "0.2.5" dependencies = [ "anyhow", "flate2", @@ -710,7 +710,7 @@ dependencies = [ [[package]] name = "packetframe-fast-path" -version = "0.2.4" +version = "0.2.5" dependencies = [ "anyhow", "aya", @@ -733,7 +733,7 @@ dependencies = [ [[package]] name = "packetframe-probe" -version = "0.2.4" +version = "0.2.5" dependencies = [ "anyhow", "aya", diff --git a/Cargo.toml b/Cargo.toml index 2b0f9a3..df4122b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,7 +23,7 @@ exclude = [ # detach pacing for bridge members, integrity-check Total-line parse # fix). See SPEC.md §11.14 for the rollout history and # `docs/runbooks/custom-fib.md` for operations. -version = "0.2.4" +version = "0.2.5" edition = "2021" # MSRV. Deliberately behind the rust-toolchain.toml pin (which is the # latest stable) so a contributor with a slightly older toolchain still diff --git a/README.md b/README.md index cf2bde7..88fda90 100644 --- a/README.md +++ b/README.md @@ -71,8 +71,9 @@ PacketFrame complements existing routing daemons rather than replacing them. 
The | Connected-destination fast-path (`local-prefix`) | Production (v0.2.1+) | | `fallback-default` synthesis | Production (v0.2.1+) | | `block-prefix` XDP-time drop | Production (v0.2.1+) | -| `mss-clamp` directive (fast-path) | Production (v0.2.4+) | +| `mss-clamp` directive (fast-path) | Production (v0.2.4+; per-prefix loads on stricter kernels in v0.2.5+) | | `packetframe reconfigure` / `systemctl reload packetframe` | Production (v0.2.4+) | +| Two-stage BPF datapath (`fast_path` + `finalize` via `bpf_tail_call`) | Production (v0.2.5+) — see [docs/runbooks/tail-call-architecture.md](docs/runbooks/tail-call-architecture.md) | | `probe` module — diagnostic XDP | Production | | `ddos` module — XDP-time SYN-flood + amplification filter | Future — sketched in SPEC §5.2 (priority 0–999, security/admission) | | `sampler` module — per-flow ringbuf observability | Future — sketched in SPEC §5.3 (priority 2000–2999, observation) | @@ -86,7 +87,7 @@ Releases are published on the [GitHub releases page](https://github.com/unredact ### Debian / Ubuntu (.deb) ```sh -VERSION=v0.2.4 +VERSION=v0.2.5 ARCH=$(dpkg --print-architecture) # amd64 or arm64 curl -LO "https://github.com/unredacted/packetframe/releases/download/${VERSION}/packetframe_${VERSION#v}_${ARCH}.deb" @@ -103,7 +104,7 @@ Installs `/usr/bin/packetframe`, the systemd unit at `/lib/systemd/system/packet For musl-static deployments, non-Debian distros, or anything else: ```sh -VERSION=v0.2.4 +VERSION=v0.2.5 TARGET=aarch64-unknown-linux-gnu # or: x86_64-unknown-linux-{gnu,musl}, aarch64-unknown-linux-musl curl -LO "https://github.com/unredacted/packetframe/releases/download/${VERSION}/packetframe-${VERSION}-${TARGET}.tar.gz" diff --git a/VERSION b/VERSION index abd4105..3a4036f 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.2.4 +0.2.5 diff --git a/crates/cli/src/loader.rs b/crates/cli/src/loader.rs index 67aa852..ae867f1 100644 --- a/crates/cli/src/loader.rs +++ b/crates/cli/src/loader.rs @@ -747,6 +747,14 @@ pub 
fn status(config_path: &Path) -> Result<(), String> { Err(e) => return Err(format!("registry read: {e}")), } + // v0.2.5+ tail-call chain summary. Confirms MUTATION_PROGS[0] + // is populated with the `finalize` program FD; if empty, + // fast_path's tail_call hits ErrTailCall and traffic falls + // through to kernel slow-path. Operators see this immediately + // in the status output rather than chasing it via err counter. + #[cfg(all(target_os = "linux", feature = "fast-path"))] + print_tail_call_chain(&config.global.bpffs_root); + // Live counter readback from the pinned STATS map. Works // whether or not the loader is running — the pin survives // process exit (§8.5). @@ -757,6 +765,24 @@ pub fn status(config_path: &Path) -> Result<(), String> { Ok(()) } +#[cfg(all(target_os = "linux", feature = "fast-path"))] +fn print_tail_call_chain(bpffs_root: &Path) { + use packetframe_fast_path::tail_call_chain_from_pin; + println!(); + println!("tail-call chain (from {}):", bpffs_root.display()); + match tail_call_chain_from_pin(bpffs_root) { + Ok(true) => println!( + " MUTATION_PROGS[0]: populated (finalize) — \ + confirm prog_id via `bpftool prog show name finalize`" + ), + Ok(false) => println!( + " MUTATION_PROGS[0]: EMPTY — fast_path's tail_call will fail; traffic \ + falls to kernel slow-path. Restart packetframe to repopulate." + ), + Err(e) => eprintln!(" MUTATION_PROGS pin unavailable ({e}); loader may not be attached"), + } +} + +#[cfg(all(target_os = "linux", feature = "fast-path"))] +fn print_stats(bpffs_root: &Path) { + // §4.6 counter names, indexed by `StatIdx` discriminants. Order // Append-only — adding new entries at the end is fine; renumbering // breaks dashboards. Indices 0-19 are the kernel-fib counter set; // 20-31 were appended in the Option F custom-FIB rollout (§4.11).
- const NAMES: [&str; 33] = [ + const NAMES: [&str; 37] = [ "rx_total", "matched_v4", "matched_v6", @@ -799,6 +825,12 @@ fn print_stats(bpffs_root: &Path) { "nexthop_seq_retry", "bmp_peer_down", "bogon_dropped", + // --- v0.2.4: mss-clamp --- + "mss_clamp_applied", + "mss_clamp_skipped", + // --- v0.2.5: two-stage datapath --- + "err_tail_call", + "err_mutation_ctx", ]; print_fib_status(bpffs_root); diff --git a/crates/modules/fast-path/bpf/src/finalize.rs b/crates/modules/fast-path/bpf/src/finalize.rs new file mode 100644 index 0000000..a634d2a --- /dev/null +++ b/crates/modules/fast-path/bpf/src/finalize.rs @@ -0,0 +1,448 @@ +//! Finalize stage: tail-called by `fast_path` after classification + +//! L2/TTL mutations. Owns mss-clamp lookup + mutation, VLAN choreography, +//! and the final `bpf_redirect_map` call. +//! +//! Lives in its own XDP program so it gets a fresh 512-byte BPF stack +//! budget. v0.2.4 inlined this work into `fast_path` and ran into UniFi +//! 5.15's stricter stack accounting (rejected at `combined stack size of +//! 3 calls is 544. Too large`). Splitting fixes the budget and provides +//! the pattern for future fast-path-internal stages. +//! +//! Communication from `fast_path` is via two side channels: +//! - The packet itself (preserved across `bpf_tail_call`). +//! - `MUTATION_CTX` per-CPU scratch (egress info, ingress VID, IP offset, +//! v4/v6 discriminator) — written by fast_path, read here. +//! +//! See SPEC.md §4.x "Two-stage BPF datapath" and +//! `docs/runbooks/tail-call-architecture.md`. + +use aya_ebpf::{ + bindings::xdp_action, + helpers::gen::bpf_xdp_adjust_head, + macros::xdp, + maps::lpm_trie::Key, + programs::XdpContext, +}; +use network_types::ip::{IpProto, Ipv4Hdr, Ipv6Hdr}; + +use crate::maps::{ + bump_stat, StatIdx, CFG, MSS_CLAMP_BY_IFACE, MSS_CLAMP_V4, MSS_CLAMP_V6, MUTATION_CTX, + REDIRECT_DEVMAP, +}; + +/// 802.1Q TPID. Mirror of `main::TPID_8021Q`; kept local so finalize +/// is self-contained. 
+const TPID_8021Q: u16 = 0x8100; + +/// Sentinel for "no VLAN" — mirror of `main::VLAN_NONE`. +const VLAN_NONE: u16 = 0; + +/// SYN flag in TCP byte 13. +const TCP_FLAG_SYN: u8 = 0x02; + +/// IANA TCP protocol number, materialized from `IpProto` (network-types +/// 0.2 changed `proto`/`next_hdr` to raw `u8`). +const PROTO_TCP: u8 = IpProto::Tcp as u8; + +/// Upper bound on `ip_offset` post-VLAN-parse. Used to give the BPF +/// verifier a tight `umax` so range propagation through packet-pointer +/// arithmetic works — see commentary on the `ip_offset > MAX_IP_OFFSET` +/// check in `finalize`. +const MAX_IP_OFFSET: usize = 64; + +#[xdp] +pub fn finalize(ctx: XdpContext) -> u32 { + // Read the per-CPU mutation context written by fast_path right + // before its tail_call. Always present in production; fail-safe + // XDP_PASS if missing so traffic falls through to kernel rather + // than getting dropped silently. + let mctx = match unsafe { MUTATION_CTX.get(0) } { + Some(c) => *c, + None => { + bump_stat(StatIdx::ErrMutationCtx); + return xdp_action::XDP_PASS; + } + }; + + let egress_ifindex = mctx.egress_ifindex; + let egress_vid = mctx.egress_vid; + let ingress_vid = mctx.ingress_vid; + let is_v4 = mctx.is_v4 != 0; + + // Clamp ip_offset to MAX_IP_OFFSET (64). The BPF verifier's + // `find_good_pkt_pointers` refuses to propagate range information + // through packet-pointer arithmetic when the scalar offset's + // umax_value exceeds MAX_PACKET_OFF (0xffff) — which is the case + // for `mctx.ip_offset` since it's read from a map and the verifier + // sees its full u32 range. Capping the offset gives the verifier + // a tight umax it can reason about, so the subsequent + // `pkt + ip_offset + ip_hdr_size > end` bound check actually + // propagates a usable readable-range back to `pkt + ip_offset`. 
+ // + // In practice fast_path writes `EthHdr::LEN` (14) or + // `EthHdr::LEN + VLAN_HDR_LEN` (18); 64 is comfortable headroom + // for a future second VLAN tag without having to revisit this. + let ip_offset = mctx.ip_offset as usize; + if ip_offset > MAX_IP_OFFSET { + return xdp_action::XDP_PASS; + } + + // mss-clamp first, then VLAN choreography (which can shift bytes + // via bpf_xdp_adjust_head). mss-clamp's TCP-options walk relies on + // ip_offset being valid relative to ctx.data() — true until VLAN + // push/pop changes the layout. + mss_clamp_inline(&ctx, ip_offset, is_v4, egress_ifindex); + + if apply_vlan_egress(&ctx, ingress_vid, egress_vid).is_err() { + bump_stat(StatIdx::ErrVlan); + return xdp_action::XDP_ABORTED; + } + + match REDIRECT_DEVMAP.redirect(egress_ifindex, 0) { + Ok(_) => { + bump_stat(StatIdx::FwdOk); + xdp_action::XDP_REDIRECT + } + Err(_) => { + bump_stat(StatIdx::ErrFibOther); + xdp_action::XDP_PASS + } + } +} + +// --- MSS clamping (relocated from main.rs in v0.2.5) ---------------------- + +/// Top-level entry: dispatch into the v4 or v6 path with a constant-sized +/// bounds check. Splitting upfront (rather than threading `is_v4` through +/// a single function) is what satisfies the BPF verifier — the bounds +/// check needs to use a compile-time-known size so the verifier can +/// track that subsequent reads via `*const Ipv4Hdr` / `*const Ipv6Hdr` +/// stay within the checked region. +/// +/// The ergonomic alternative — `let size = if is_v4 { 20 } else { 40 }; +/// if start + offset + size > end { ... }; ip_addr as *const Ipv4Hdr` — +/// loses the verifier's bound-tracking when the cast is reached: see +/// `R9 offset is outside of the packet` from the v0.2.5 prerelease build. 
+#[inline(always)] +fn mss_clamp_inline(ctx: &XdpContext, ip_offset: usize, is_v4: bool, egress_ifindex: u32) { + if is_v4 { + mss_clamp_v4(ctx, ip_offset, egress_ifindex); + } else { + mss_clamp_v6(ctx, ip_offset, egress_ifindex); + } +} + +/// IPv4 path: bounds-check exactly `Ipv4Hdr::LEN` bytes, then cast +/// directly to `*const Ipv4Hdr`. Mirrors the `ptr_at` pattern from +/// main.rs that the verifier accepts. +#[inline(always)] +fn mss_clamp_v4(ctx: &XdpContext, ip_offset: usize, egress_ifindex: u32) { + let start = ctx.data(); + let end = ctx.data_end(); + if start + ip_offset + Ipv4Hdr::LEN > end { + return; + } + let ip: *const Ipv4Hdr = (start + ip_offset) as *const Ipv4Hdr; + let proto = unsafe { (*ip).proto }; + if proto != PROTO_TCP { + return; + } + let clamp = lookup_mss_clamp_v4(ip, egress_ifindex); + if clamp == 0 { + return; + } + mss_clamp_tcp(ctx, ip as *const u8, Ipv4Hdr::LEN, clamp); +} + +/// IPv6 path: same pattern as `mss_clamp_v4` but with a 40-byte bound. +#[inline(always)] +fn mss_clamp_v6(ctx: &XdpContext, ip_offset: usize, egress_ifindex: u32) { + let start = ctx.data(); + let end = ctx.data_end(); + if start + ip_offset + Ipv6Hdr::LEN > end { + return; + } + let ip: *const Ipv6Hdr = (start + ip_offset) as *const Ipv6Hdr; + let proto = unsafe { (*ip).next_hdr }; + if proto != PROTO_TCP { + return; + } + let clamp = lookup_mss_clamp_v6(ip, egress_ifindex); + if clamp == 0 { + return; + } + mss_clamp_tcp(ctx, ip as *const u8, Ipv6Hdr::LEN, clamp); +} + +/// Walk the TCP-options block of a matched SYN/SYN-ACK and mutate the MSS +/// option in place if the existing MSS is greater than the clamp value. +/// Recomputes the TCP checksum incrementally (RFC 1624). Bumps +/// `MssClampApplied` on rewrite, `MssClampSkipped` on "policy applies but +/// no rewrite needed." +/// +/// Takes a typed `ip_ptr` (already bounds-checked for `ip_hdr_size` bytes) +/// rather than a raw `tcp_offset` scalar. 
Inside, we recover ip_offset as +/// `(ip_ptr as usize) - start`, which the BPF verifier tracks as a `pkt - +/// pkt` subtraction with `umax = MAX_PACKET_OFF (0xffff)`. That tight bound +/// is what makes subsequent `start + tcp_offset + N > end` checks +/// propagate readable-range to the read site (mirrors v0.2.4's working +/// pattern; passing `tcp_offset` directly as a `usize` from a map read +/// loses verifier tracking and the post-bound-check pkt pointer ends up +/// with `r=0`). +/// +/// Bounds-checked at every read against `ctx.data_end()`. Options walk +/// is fixed-bound at 8 iterations to keep BPF verifier state-space +/// exploration tractable (a 40-iteration walk hit the verifier's +/// 1M-instruction limit during v0.2.4 development). +#[inline(always)] +fn mss_clamp_tcp(ctx: &XdpContext, ip_ptr: *const u8, ip_hdr_size: usize, clamp: u16) { + let start = ctx.data(); + let end = ctx.data_end(); + + // pkt-derived scalar; verifier tracks umax tightly. + let ip_offset = (ip_ptr as usize) - start; + let tcp_offset = ip_offset + ip_hdr_size; + + // Need 20 bytes for the fixed TCP header before walking options. + if start + tcp_offset + 20 > end { + return; + } + + // Bytes 12-13 of TCP header: data_offset:4 | reserved:4 | flags:8. + let doff_byte = unsafe { *((start + tcp_offset + 12) as *const u8) }; + let flags = unsafe { *((start + tcp_offset + 13) as *const u8) }; + if flags & TCP_FLAG_SYN == 0 { + return; // Not SYN/SYN-ACK. + } + let doff_words = (doff_byte >> 4) as usize; + if !(5..=15).contains(&doff_words) { + return; + } + let tcp_hdr_len = doff_words * 4; + let opts_len = tcp_hdr_len - 20; + if opts_len == 0 { + bump_stat(StatIdx::MssClampSkipped); + return; + } + if start + tcp_offset + tcp_hdr_len > end { + return; + } + + // Walk options. Cap at 8 — real SYN packets put MSS in the first + // 1-4 options (Linux's tcp_options_write emits MSS very early); 8 + // is comfortable headroom while keeping verifier state-space bounded. 
+ let opts_start_off = tcp_offset + 20; + let mut cursor: usize = 0; + let mut found = false; + + for _ in 0..8 { + if cursor >= opts_len { + break; + } + let p_addr = start + opts_start_off + cursor; + if p_addr + 4 > end { + break; + } + let p = p_addr as *const u8; + let kind = unsafe { *p }; + if kind == 0 { + break; // EOL. + } + if kind == 1 { + cursor += 1; // NOP. + continue; + } + let length = unsafe { *p.add(1) } as usize; + if length < 2 || cursor + length > opts_len { + break; // Malformed. + } + if kind == 2 && length == 4 { + // MSS option: [kind=2, length=4, mss_be:2]. + let mss_be = unsafe { [*p.add(2), *p.add(3)] }; + let mss = u16::from_be_bytes(mss_be); + if mss > clamp { + let new_mss_be = clamp.to_be_bytes(); + unsafe { + let pmut = p as *mut u8; + *pmut.add(2) = new_mss_be[0]; + *pmut.add(3) = new_mss_be[1]; + } + // RFC 1624 incremental TCP checksum update. + let csum_off = tcp_offset + 16; + if start + csum_off + 2 > end { + return; + } + let csum_p = (start + csum_off) as *mut u8; + let old_csum_be = unsafe { [*csum_p, *csum_p.add(1)] }; + let old_csum = u16::from_be_bytes(old_csum_be); + let new_csum = csum_replace_u16(old_csum, mss, clamp); + let new_csum_be = new_csum.to_be_bytes(); + unsafe { + *csum_p = new_csum_be[0]; + *csum_p.add(1) = new_csum_be[1]; + } + bump_stat(StatIdx::MssClampApplied); + } else { + bump_stat(StatIdx::MssClampSkipped); + } + found = true; + break; + } + cursor += length; + } + + if !found { + bump_stat(StatIdx::MssClampSkipped); + } +} + +#[inline(always)] +fn csum_replace_u16(old_csum: u16, old_val: u16, new_val: u16) -> u16 { + let mut sum: u32 = (!old_csum) as u32 + (!old_val) as u32 + new_val as u32; + sum = (sum & 0xffff) + (sum >> 16); + sum = (sum & 0xffff) + (sum >> 16); + !(sum as u16) +} + +#[inline(always)] +fn lookup_mss_clamp_v4(ip: *const Ipv4Hdr, egress_ifindex: u32) -> u16 { + { + let key = Key::new(32, unsafe { (*ip).src_addr }); + if let Some(entry) = MSS_CLAMP_V4.get(&key) { + if 
entry.iface_filter == 0 || entry.iface_filter == egress_ifindex { + return entry.mss; + } + } + } + { + let key = Key::new(32, unsafe { (*ip).dst_addr }); + if let Some(entry) = MSS_CLAMP_V4.get(&key) { + if entry.iface_filter == 0 || entry.iface_filter == egress_ifindex { + return entry.mss; + } + } + } + if let Some(mss) = unsafe { MSS_CLAMP_BY_IFACE.get(&egress_ifindex) } { + if *mss != 0 { + return *mss; + } + } + if let Some(c) = CFG.get(0) { + return c.mss_clamp_global; + } + 0 +} + +#[inline(always)] +fn lookup_mss_clamp_v6(ip: *const Ipv6Hdr, egress_ifindex: u32) -> u16 { + { + let key = Key::new(128, unsafe { (*ip).src_addr }); + if let Some(entry) = MSS_CLAMP_V6.get(&key) { + if entry.iface_filter == 0 || entry.iface_filter == egress_ifindex { + return entry.mss; + } + } + } + { + let key = Key::new(128, unsafe { (*ip).dst_addr }); + if let Some(entry) = MSS_CLAMP_V6.get(&key) { + if entry.iface_filter == 0 || entry.iface_filter == egress_ifindex { + return entry.mss; + } + } + } + if let Some(mss) = unsafe { MSS_CLAMP_BY_IFACE.get(&egress_ifindex) } { + if *mss != 0 { + return *mss; + } + } + if let Some(c) = CFG.get(0) { + return c.mss_clamp_global; + } + 0 +} + +// --- VLAN choreography (relocated from main.rs in v0.2.5) ----------------- + +/// SPEC §4.7's four-case VLAN matrix, keyed on VLAN_NONE-sentinel u16s +/// rather than `Option` (the verifier rejects the Option-argument +/// spill across a function boundary). +#[inline(always)] +fn apply_vlan_egress(ctx: &XdpContext, ingress_vid: u16, egress_vid: u16) -> Result<(), ()> { + let ingress_present = ingress_vid != VLAN_NONE; + let egress_present = egress_vid != VLAN_NONE; + match (ingress_present, egress_present) { + (false, false) => Ok(()), + (true, true) if ingress_vid == egress_vid => Ok(()), + (false, true) => vlan_push(ctx, egress_vid), + (true, false) => vlan_pop(ctx), + (true, true) => vlan_rewrite(ctx, egress_vid), + } +} + +/// Untagged → tagged. 
Grows headroom by 4, shifts the MAC pair left by +/// 4 bytes, writes TPID + TCI into the freed-up slot. Uses +/// `core::ptr::copy` (memmove) not `copy_nonoverlapping` because source +/// and destination overlap — SPEC calls this out as a footgun. +#[inline(always)] +fn vlan_push(ctx: &XdpContext, vid: u16) -> Result<(), ()> { + let rc = unsafe { bpf_xdp_adjust_head(ctx.ctx as *mut _, -4) }; + if rc != 0 { + return Err(()); + } + let start = ctx.data(); + let end = ctx.data_end(); + if start + 18 > end { + return Err(()); + } + unsafe { + let base = start as *mut u8; + core::ptr::copy(base.add(4), base, 6); + core::ptr::copy(base.add(10), base.add(6), 6); + let tpid = TPID_8021Q.to_be_bytes(); + *base.add(12) = tpid[0]; + *base.add(13) = tpid[1]; + let tci = (vid & 0x0fff).to_be_bytes(); + *base.add(14) = tci[0]; + *base.add(15) = tci[1]; + } + Ok(()) +} + +/// Tagged → untagged. Shifts the MAC pair right by 4 over the about-to- +/// be-discarded TPID+TCI slot, then shrinks headroom by 4. +#[inline(always)] +fn vlan_pop(ctx: &XdpContext) -> Result<(), ()> { + let start = ctx.data(); + let end = ctx.data_end(); + if start + 18 > end { + return Err(()); + } + unsafe { + let base = start as *mut u8; + core::ptr::copy(base.add(6), base.add(10), 6); + core::ptr::copy(base, base.add(4), 6); + } + let rc = unsafe { bpf_xdp_adjust_head(ctx.ctx as *mut _, 4) }; + if rc != 0 { + return Err(()); + } + Ok(()) +} + +/// Tagged VID X → tagged VID Y. No headroom change; overwrite TCI in place. 
+#[inline(always)] +fn vlan_rewrite(ctx: &XdpContext, vid: u16) -> Result<(), ()> { + let start = ctx.data(); + let end = ctx.data_end(); + if start + 16 > end { + return Err(()); + } + let tci = (vid & 0x0fff).to_be_bytes(); + unsafe { + let base = start as *mut u8; + *base.add(14) = tci[0]; + *base.add(15) = tci[1]; + } + Ok(()) +} diff --git a/crates/modules/fast-path/bpf/src/main.rs b/crates/modules/fast-path/bpf/src/main.rs index 39fd065..ae856b9 100644 --- a/crates/modules/fast-path/bpf/src/main.rs +++ b/crates/modules/fast-path/bpf/src/main.rs @@ -31,12 +31,13 @@ use network_types::{ }; mod fib; +mod finalize; mod maps; use maps::{ bump_stat, StatIdx, ALLOW_V4, ALLOW_V6, BLOCK_V4, BLOCK_V6, CFG, FP_CFG_FLAG_COMPARE_MODE, - FP_CFG_FLAG_CUSTOM_FIB, FP_CFG_FLAG_HEAD_SHIFT_128, MSS_CLAMP_BY_IFACE, MSS_CLAMP_V4, - MSS_CLAMP_V6, REDIRECT_DEVMAP, VLAN_RESOLVE, + FP_CFG_FLAG_CUSTOM_FIB, FP_CFG_FLAG_HEAD_SHIFT_128, MUTATION_CTX, MUTATION_PROGS, + REDIRECT_DEVMAP, VLAN_RESOLVE, }; const AF_INET: u8 = 2; @@ -50,9 +51,6 @@ const PROTO_TCP: u8 = IpProto::Tcp as u8; const PROTO_UDP: u8 = IpProto::Udp as u8; const PROTO_ICMPV6: u8 = IpProto::Ipv6Icmp as u8; -/// 802.1Q TPID. What we write back on push / rewrite. -const TPID_8021Q: u16 = 0x8100; - /// Sentinel u16 representing "no VLAN" in the VID-passing API below. /// 802.1Q reserves VID 0 for priority-only tagging, so `vid == 0` /// means absent from the fast-path's perspective. Using a single u16 @@ -418,8 +416,14 @@ fn dispatch_fib( /// Success path shared between the kernel-FIB (`dispatch_fib`) and /// custom-FIB (`dispatch_custom_fib`) code paths. Takes a decided /// `(egress_ifindex, smac, dmac)` and performs VLAN resolution, -/// devmap pre-check, TTL decrement, L2 rewrite, VLAN choreography, -/// and redirect. Must not be called without a valid forward decision. 
+/// devmap pre-check, TTL decrement, L2 rewrite, then writes the +/// per-CPU `MUTATION_CTX` and tail-calls into the `finalize` program +/// which handles mss-clamp + VLAN choreography + `bpf_redirect_map`. +/// +/// The split is v0.2.5+: prior versions did mss-clamp + VLAN + +/// redirect inline, but the cumulative stack pushed past UniFi 5.15's +/// stricter accounting. Splitting gives finalize its own 512-byte +/// stack budget. See SPEC §4.x "Two-stage BPF datapath." #[inline(always)] fn forward_success( ctx: &XdpContext, @@ -453,48 +457,52 @@ fn forward_success( return Ok(xdp_action::XDP_PASS); } - // MSS clamping (v0.2.4+, closes SPEC §11.4 gap). Mutate the TCP - // MSS option in SYN/SYN-ACK packets before they're handed to the - // egress NIC — must happen before `apply_vlan_egress` (which can - // shift packet bytes via `bpf_xdp_adjust_head`) but is order- - // independent w.r.t. TTL decrement and L2 rewrite (those edit - // existing bytes in place). No-op for non-TCP, non-SYN packets, - // or when no clamp policy applies. Skipped under `is_dry_run()` - // because dry-run returns XDP_PASS earlier in the flow. - mss_clamp_inline(ctx, ip, is_v4, egress_ifindex); - - // TTL/hop_limit + csum first — IP header's position in memory - // doesn't change with adjust_head, only its offset from `data`. + // TTL/hop_limit + csum — IP header's position in memory doesn't + // change with adjust_head, only its offset from `data`. Safe to do + // before VLAN choreography (which lives in finalize). if is_v4 { decrement_ipv4_ttl(ip as *mut Ipv4Hdr); } else { decrement_ipv6_hop_limit(ip as *mut Ipv6Hdr); } - // L2 rewrite BEFORE push/pop — push moves the current MAC - // positions into new slots, so the values there need to be the - // post-FIB MACs. + // L2 rewrite (in-place; no offset change). Done here so the post- + // FIB MACs are in place before VLAN push potentially shifts them. 
unsafe { (*eth).dst_addr = dmac; (*eth).src_addr = smac; } - // VLAN choreography (SPEC §4.7). On error, XDP_ABORTED + - // err_vlan per the spec. - if apply_vlan_egress(ctx, ingress_vid, egress_vid).is_err() { - bump_stat(StatIdx::ErrVlan); - return Ok(xdp_action::XDP_ABORTED); - } - - match REDIRECT_DEVMAP.redirect(egress_ifindex, 0) { - Ok(_) => { - bump_stat(StatIdx::FwdOk); - Ok(xdp_action::XDP_REDIRECT) - } - Err(_) => { - bump_stat(StatIdx::ErrFibOther); - Ok(xdp_action::XDP_PASS) + // Write decision state to per-CPU MUTATION_CTX, then tail-call + // into MUTATION_PROGS[0] (= finalize). Finalize handles mss-clamp, + // VLAN choreography, and the final bpf_redirect_map. `ip` is a + // packet pointer derived from `ctx.data()`; their difference is a + // verifier-tracked scalar. + let ip_offset = (ip as usize) - ctx.data(); + if let Some(mctx_ptr) = MUTATION_CTX.get_ptr_mut(0) { + unsafe { + (*mctx_ptr).egress_ifindex = egress_ifindex; + (*mctx_ptr).egress_vid = egress_vid; + (*mctx_ptr).ingress_vid = ingress_vid; + (*mctx_ptr).ip_offset = ip_offset as u32; + (*mctx_ptr).is_v4 = u8::from(is_v4); + (*mctx_ptr)._pad = [0; 3]; } + } else { + // Per-CPU array index 0 is always present; this branch should + // be unreachable. Fail-safe XDP_PASS so traffic falls to kernel + // rather than getting blackholed. + bump_stat(StatIdx::ErrMutationCtx); + return Ok(xdp_action::XDP_PASS); } + + // tail_call returns Err on slot-empty / invalid; on success it + // doesn't return at all (control transfers to finalize). The Err + // branch is fail-safe — if userspace's populate_mutation_progs + // somehow skipped slot 0, traffic falls through to kernel rather + // than getting silently dropped. 
+ let _ = unsafe { MUTATION_PROGS.tail_call(ctx, 0) }; + bump_stat(StatIdx::ErrTailCall); + Ok(xdp_action::XDP_PASS) } /// Dispatch on a [`fib::CustomFibResult`] returned by the custom-FIB @@ -564,365 +572,6 @@ fn compare_and_bump( } } -// --- VLAN choreography ---------------------------------------------------- - -/// §4.7's four-case matrix, keyed on VLAN_NONE-sentinel u16s rather -/// than Option (the verifier rejects the Option-argument spill). -/// Returns Err(()) on any packet-manipulation failure. -#[inline(always)] -fn apply_vlan_egress(ctx: &XdpContext, ingress_vid: u16, egress_vid: u16) -> Result<(), ()> { - let ingress_present = ingress_vid != VLAN_NONE; - let egress_present = egress_vid != VLAN_NONE; - match (ingress_present, egress_present) { - (false, false) => Ok(()), - (true, true) if ingress_vid == egress_vid => Ok(()), - (false, true) => vlan_push(ctx, egress_vid), - (true, false) => vlan_pop(ctx), - (true, true) => vlan_rewrite(ctx, egress_vid), - } -} - -/// Untagged → tagged. Grows headroom by 4, shifts the MAC pair left by -/// 4 bytes, writes TPID + TCI into the freed-up slot. SPEC §4.7. -/// -/// Uses `core::ptr::copy` (true memmove) not `copy_nonoverlapping` — -/// the source and destination regions overlap. This is the footgun -/// SPEC calls out: on some VID combinations `copy_nonoverlapping` -/// produces wrong bytes on the wire and the verifier does NOT catch it. -#[inline(always)] -fn vlan_push(ctx: &XdpContext, vid: u16) -> Result<(), ()> { - // Grow headroom by 4 bytes. - let rc = unsafe { bpf_xdp_adjust_head(ctx.ctx as *mut _, -4) }; - if rc != 0 { - return Err(()); - } - - let start = ctx.data(); - let end = ctx.data_end(); - if start + 18 > end { - return Err(()); - } - - // SAFETY: pointers derived from the freshly-re-read data/data_end, - // both bounds-checked for the 18-byte range we touch. - unsafe { - let base = start as *mut u8; - // Move dst_mac left: [4..10] → [0..6]. 6-byte move, overlapping. 
- core::ptr::copy(base.add(4), base, 6); - // Move src_mac left: [10..16] → [6..12]. Overlapping. - core::ptr::copy(base.add(10), base.add(6), 6); - // TPID (0x8100) at [12..14], big-endian. - let tpid = TPID_8021Q.to_be_bytes(); - *base.add(12) = tpid[0]; - *base.add(13) = tpid[1]; - // TCI at [14..16]: PCP=0 | DEI=0 | VID (12 bits), big-endian. - let tci = (vid & 0x0fff).to_be_bytes(); - *base.add(14) = tci[0]; - *base.add(15) = tci[1]; - // [16..18] already holds the original inner ethertype — - // bpf_xdp_adjust_head didn't touch packet bytes, only the - // data pointer. - } - Ok(()) -} - -/// Tagged → untagged. Shifts the MAC pair right by 4 bytes over the -/// (about-to-be-discarded) TPID+TCI slot, then shrinks headroom by 4. -#[inline(always)] -fn vlan_pop(ctx: &XdpContext) -> Result<(), ()> { - let start = ctx.data(); - let end = ctx.data_end(); - if start + 18 > end { - return Err(()); - } - // SAFETY: bounds-checked 18-byte range. - unsafe { - let base = start as *mut u8; - // Move src_mac right first: [6..12] → [10..16]. Overlapping. - core::ptr::copy(base.add(6), base.add(10), 6); - // Move dst_mac right: [0..6] → [4..10]. Overlapping. - core::ptr::copy(base, base.add(4), 6); - } - // Shrink headroom by 4; new data starts 4 bytes later. - let rc = unsafe { bpf_xdp_adjust_head(ctx.ctx as *mut _, 4) }; - if rc != 0 { - return Err(()); - } - Ok(()) -} - -/// Tagged VID X → tagged VID Y (X ≠ Y). No headroom change; overwrite -/// the TCI bytes in place. -#[inline(always)] -fn vlan_rewrite(ctx: &XdpContext, vid: u16) -> Result<(), ()> { - let start = ctx.data(); - let end = ctx.data_end(); - if start + 16 > end { - return Err(()); - } - let tci = (vid & 0x0fff).to_be_bytes(); - unsafe { - let base = start as *mut u8; - *base.add(14) = tci[0]; - *base.add(15) = tci[1]; - } - Ok(()) -} - -// --- MSS clamping (v0.2.4+, SPEC §4.x — closes §11.4 gap) ----------------- - -/// SYN flag in TCP byte 13 (low byte of `doff_flags` over the wire). 
-const TCP_FLAG_SYN: u8 = 0x02; - -/// Walk the TCP-options block of a matched SYN/SYN-ACK and mutate the -/// MSS option in place if a clamp policy applies and the existing MSS -/// is greater than the clamp value. Recomputes the TCP checksum -/// incrementally (RFC 1624). Bumps `MssClampApplied` on rewrite and -/// `MssClampSkipped` on "policy applies but no rewrite needed" — i.e. -/// existing MSS already ≤ clamp, no MSS option present, or malformed -/// options walked past before finding it. -/// -/// Bounds-checked at every read against `ctx.data_end()`. The options -/// loop is fixed-bound at 40 iterations (TCP options max 40 bytes; -/// each iteration consumes at least 1 byte) so the BPF verifier -/// accepts it without unrolling concerns. -/// -/// Marked `#[inline(always)]` deliberately. Two earlier attempts to -/// split this into a subprogram (for stack budget) ran into the BPF -/// kernel verifier rejecting the bpf2bpf calling convention LLVM -/// emits: even when arguments are scalar, LLVM SROA decomposes -/// `&XdpContext` into `(data, data_end)` packet pointers, and the -/// verifier prohibits pointer-shift instructions on packet pointers -/// (the lift LLVM emits to extend 32-bit→64-bit). Inlining is the -/// only verifier-friendly option for code that touches the packet. -/// -/// Stack trim: each LPM key is block-scoped so the compiler can -/// reuse the same stack slot for src and dst keys rather than -/// holding both live; lookup helpers are also `#[inline(always)]` -/// for the same reason; src/dst addresses are read inside their -/// respective LPM blocks rather than at the function top. -#[inline(always)] -fn mss_clamp_inline(ctx: &XdpContext, ip: *mut u8, is_v4: bool, egress_ifindex: u32) { - let start = ctx.data(); - let end = ctx.data_end(); - - // Read protocol byte first — bail early on non-TCP, which is - // the overwhelmingly common case for fast-pathed traffic. 
- let proto = if is_v4 { - unsafe { (*(ip as *const Ipv4Hdr)).proto } - } else { - unsafe { (*(ip as *const Ipv6Hdr)).next_hdr } - }; - if proto != PROTO_TCP { - return; - } - - // Look up clamp value via the precedence chain. Returns 0 if no - // policy applies. Helper is `#[inline(always)]`; its locals share - // this function's frame and are block-scoped for slot reuse. - let clamp = if is_v4 { - lookup_mss_clamp_v4(ip as *const Ipv4Hdr, egress_ifindex) - } else { - lookup_mss_clamp_v6(ip as *const Ipv6Hdr, egress_ifindex) - }; - if clamp == 0 { - return; - } - - // Recover the IP-header offset so we can compute the TCP offset - // (and bounds-check) without holding `ip` as a separate pointer - // variable. ip - start is a scalar (pkt_a - pkt_b) per the - // verifier. - let ip_offset = (ip as usize) - start; - let tcp_offset = ip_offset + if is_v4 { Ipv4Hdr::LEN } else { Ipv6Hdr::LEN }; - - // Need 20 bytes for the fixed TCP header before walking options. - if start + tcp_offset + 20 > end { - return; - } - - // Bytes 12-13 of TCP header: data_offset:4 | reserved:4 | flags:8. - // doff is in 32-bit words; valid range [5, 15] = [20, 60] bytes. - let doff_byte = unsafe { *((start + tcp_offset + 12) as *const u8) }; - let flags = unsafe { *((start + tcp_offset + 13) as *const u8) }; - if flags & TCP_FLAG_SYN == 0 { - return; // Not SYN/SYN-ACK; clamp doesn't apply. - } - let doff_words = (doff_byte >> 4) as usize; - if !(5..=15).contains(&doff_words) { - return; - } - let tcp_hdr_len = doff_words * 4; - let opts_len = tcp_hdr_len - 20; - if opts_len == 0 { - // SYN with no options — operator policy says "clamp" but - // there's no MSS field to mutate. Count as skipped. - bump_stat(StatIdx::MssClampSkipped); - return; - } - if start + tcp_offset + tcp_hdr_len > end { - return; - } - - // Walk options. 
Cap at 8 iterations: real SYN packets put MSS in - // the first 1-4 options, and 8 is plenty of headroom while - // keeping the BPF verifier's state-exploration bounded. A 40- - // iteration walk hit the verifier's 1M-instruction processing - // limit due to combinatorial state explosion across the branches. - // - // Use sequential `if` checks rather than `match` for the same - // reason — fewer state-space splits per iteration. - let opts_start_off = tcp_offset + 20; - let mut cursor: usize = 0; - let mut found = false; - - for _ in 0..8 { - if cursor >= opts_len { - break; - } - let p_addr = start + opts_start_off + cursor; - // Need at least 4 bytes for a worst-case MSS option; if - // there's less than that left, no MSS is possible. Also - // bounds-checks the kind/length reads below. - if p_addr + 4 > end { - break; - } - let p = p_addr as *const u8; - let kind = unsafe { *p }; - if kind == 0 { - break; // EOL — no more options. - } - if kind == 1 { - cursor += 1; // NOP, single byte. - continue; - } - // Length-prefixed option (kind != 0, != 1). Length includes - // the kind+length bytes themselves; valid range is 2..=opts_len. - let length = unsafe { *p.add(1) } as usize; - if length < 2 || cursor + length > opts_len { - break; // Malformed. - } - if kind == 2 && length == 4 { - // MSS option: [kind=2, length=4, mss_be:2]. - let mss_be = unsafe { [*p.add(2), *p.add(3)] }; - let mss = u16::from_be_bytes(mss_be); - if mss > clamp { - let new_mss_be = clamp.to_be_bytes(); - unsafe { - let pmut = p as *mut u8; - *pmut.add(2) = new_mss_be[0]; - *pmut.add(3) = new_mss_be[1]; - } - // TCP csum is at offset 16 of the TCP header; do an - // RFC 1624 incremental update. 
- let csum_off = tcp_offset + 16; - if start + csum_off + 2 > end { - return; - } - let csum_p = (start + csum_off) as *mut u8; - let old_csum_be = unsafe { [*csum_p, *csum_p.add(1)] }; - let old_csum = u16::from_be_bytes(old_csum_be); - let new_csum = csum_replace_u16(old_csum, mss, clamp); - let new_csum_be = new_csum.to_be_bytes(); - unsafe { - *csum_p = new_csum_be[0]; - *csum_p.add(1) = new_csum_be[1]; - } - bump_stat(StatIdx::MssClampApplied); - } else { - bump_stat(StatIdx::MssClampSkipped); - } - found = true; - break; - } - cursor += length; - } - - if !found { - // Hit EOL or walked past the budget without an MSS option. - bump_stat(StatIdx::MssClampSkipped); - } -} - -/// Apply RFC 1624 incremental checksum update for a single 16-bit -/// field change: `HC' = ~(~HC + ~m + m')`. Two-iteration end-around -/// carry fold (max 2 needed for adding three 16-bit values into a -/// u32). Verifier-friendly — no loops. -#[inline(always)] -fn csum_replace_u16(old_csum: u16, old_val: u16, new_val: u16) -> u16 { - let mut sum: u32 = (!old_csum) as u32 + (!old_val) as u32 + new_val as u32; - sum = (sum & 0xffff) + (sum >> 16); - sum = (sum & 0xffff) + (sum >> 16); - !(sum as u16) -} - -/// Resolve the mss-clamp value for an IPv4 packet, in precedence -/// order: src-prefix → dst-prefix → per-egress → global. Returns 0 if -/// no policy applies. The LPM lookups respect each entry's -/// `iface_filter` (0 = wildcard). Block-scope each Key + addr so LLVM -/// can reuse the same stack slot rather than carrying both keys live -/// — matters for the cumulative BPF 512-byte stack budget. Reads -/// addresses through the IP-header pointer rather than taking them -/// by value so the caller doesn't pre-materialize them on its frame. 
-#[inline(always)] -fn lookup_mss_clamp_v4(ip: *const Ipv4Hdr, egress_ifindex: u32) -> u16 { - { - let key = Key::new(32, unsafe { (*ip).src_addr }); - if let Some(entry) = MSS_CLAMP_V4.get(&key) { - if entry.iface_filter == 0 || entry.iface_filter == egress_ifindex { - return entry.mss; - } - } - } - { - let key = Key::new(32, unsafe { (*ip).dst_addr }); - if let Some(entry) = MSS_CLAMP_V4.get(&key) { - if entry.iface_filter == 0 || entry.iface_filter == egress_ifindex { - return entry.mss; - } - } - } - if let Some(mss) = unsafe { MSS_CLAMP_BY_IFACE.get(&egress_ifindex) } { - if *mss != 0 { - return *mss; - } - } - if let Some(c) = CFG.get(0) { - return c.mss_clamp_global; - } - 0 -} - -/// IPv6 mirror of [`lookup_mss_clamp_v4`] — same precedence, /128 keys. -#[inline(always)] -fn lookup_mss_clamp_v6(ip: *const Ipv6Hdr, egress_ifindex: u32) -> u16 { - { - let key = Key::new(128, unsafe { (*ip).src_addr }); - if let Some(entry) = MSS_CLAMP_V6.get(&key) { - if entry.iface_filter == 0 || entry.iface_filter == egress_ifindex { - return entry.mss; - } - } - } - { - let key = Key::new(128, unsafe { (*ip).dst_addr }); - if let Some(entry) = MSS_CLAMP_V6.get(&key) { - if entry.iface_filter == 0 || entry.iface_filter == egress_ifindex { - return entry.mss; - } - } - } - if let Some(mss) = unsafe { MSS_CLAMP_BY_IFACE.get(&egress_ifindex) } { - if *mss != 0 { - return *mss; - } - } - if let Some(c) = CFG.get(0) { - return c.mss_clamp_global; - } - 0 -} - // --- TTL / csum / helpers ------------------------------------------------- /// Decrement IPv4 TTL and patch the header checksum using RFC 1624 diff --git a/crates/modules/fast-path/bpf/src/maps.rs b/crates/modules/fast-path/bpf/src/maps.rs index 4e9c6fe..61c3776 100644 --- a/crates/modules/fast-path/bpf/src/maps.rs +++ b/crates/modules/fast-path/bpf/src/maps.rs @@ -8,7 +8,7 @@ use aya_ebpf::{ macros::map, - maps::{Array, DevMapHash, HashMap, LpmTrie, PerCpuArray, RingBuf}, + maps::{Array, DevMapHash, HashMap, LpmTrie, 
PerCpuArray, ProgramArray, RingBuf}, }; /// Runtime flags poked by userspace via the `cfg` map. `version` is a @@ -134,11 +134,21 @@ pub enum StatIdx { /// to gauge how often clamps are firing vs being skipped on existing /// well-behaved traffic. MssClampSkipped = 34, + /// v0.2.5: fast-path's `bpf_tail_call` into `MUTATION_PROGS[0]` + /// returned an error (slot empty / invalid). Should be 0 in steady + /// state; non-zero means `populate_mutation_progs` failed at attach + /// time. fast_path falls back to XDP_PASS so traffic still flows + /// (kernel slow path) — the chain is fail-safe. + ErrTailCall = 35, + /// v0.2.5: finalize couldn't read the per-CPU `MUTATION_CTX` + /// scratch slot. Shouldn't happen — fast_path always writes before + /// tail_call. Diagnostic; finalize XDP_PASSes on this error. + ErrMutationCtx = 36, } /// Total counter count. Used as `stats` map `max_entries`. New counters /// bump this; dashboards keying on indices keep working. -pub const STATS_COUNT: u32 = 35; +pub const STATS_COUNT: u32 = 37; /// Flag bits for `FpCfg.flags`. Bits 0-1 are the IPv4/IPv6 enable /// mask (historical, load-bearing for dashboards). Bit 2 is the @@ -252,6 +262,45 @@ pub struct MssClampValue { pub iface_filter: u32, } +// --- Two-stage datapath: per-CPU mutation context (v0.2.5+) ------------ + +/// Per-CPU scratch carrying decision state from `fast_path` (XDP, attached +/// to ifaces) to `finalize` (XDP, tail-called by fast_path). +/// +/// fast_path writes this immediately before `MUTATION_PROGS.tail_call(0)`; +/// finalize reads it as its first action. The packet itself is preserved +/// across the tail-call by the kernel (`xdp_buff` survives, mutations +/// stick), but locally-computed scalars (FIB-resolved egress, ingress/egress +/// VID, IP-header offset, family discriminator) need this side channel. +/// +/// Per-CPU because the NAPI cycle is single-CPU; the read in finalize is +/// guaranteed to see the write in fast_path with no synchronization. 
+/// +/// 16 bytes, naturally aligned. Size + alignment is asserted in a +/// userspace test (see `crates/modules/fast-path/src/linux_impl.rs`'s +/// `MutationCtx` mirror). +#[repr(C)] +#[derive(Copy, Clone)] +pub struct MutationCtx { + /// FIB-resolved egress (pre-VLAN-resolve). Used by finalize for + /// VLAN_RESOLVE lookup, MSS_CLAMP_BY_IFACE lookup, and the final + /// `bpf_redirect_map` call. + pub egress_ifindex: u32, + /// Egress VLAN ID (from VLAN_RESOLVE in fast_path; 0 = untagged). + pub egress_vid: u16, + /// Ingress VLAN ID (from packet parse; 0 = untagged). Combined with + /// `egress_vid` drives the four-case VLAN choreography in finalize. + pub ingress_vid: u16, + /// Offset (bytes) from `ctx.data()` to the IP header. fast_path + /// already validated bounds; finalize uses this for mss-clamp's + /// TCP-header bounds check. + pub ip_offset: u32, + /// 1 = IPv4 packet, 0 = IPv6. Determines which IP-header struct + /// finalize casts to and which MSS_CLAMP map to consult. + pub is_v4: u8, + pub _pad: [u8; 3], +} + // --- Custom FIB value layouts (Option F) ------------------------------- /// Max nexthops in a single ECMP group. The XDP program walks `nh_idx` @@ -452,6 +501,28 @@ pub static MSS_CLAMP_V6: LpmTrie<[u8; 16], MssClampValue> = pub static MSS_CLAMP_BY_IFACE: HashMap = HashMap::with_max_entries(MSS_CLAMP_IFACE_MAX_ENTRIES, 0); +// --- Two-stage datapath maps (v0.2.5+) --------------------------------- + +/// Per-CPU scratch carrying decision state from `fast_path` to `finalize` +/// across `bpf_tail_call`. Single-element array; fast_path writes index 0 +/// before `tail_call`, finalize reads index 0 as its first action. Per- +/// CPU avoids contention; NAPI cycle is single-CPU so the read sees the +/// most recent write. +#[map] +pub static MUTATION_CTX: PerCpuArray = PerCpuArray::with_max_entries(1, 0); + +/// Tail-call jump table (v0.2.5+). fast_path tail_calls into slot 0 after +/// classification + L2/TTL mutations. 
Slot 0 holds `finalize` today. +/// Sized for 8 future stages (chained finalizers, alternate clamp +/// strategies, future packet transforms) — slot count is BPF-load-time- +/// fixed, so headroom here is cheap. +/// +/// Tail-call into an empty slot returns an error to the caller; fast_path +/// handles this by bumping `ErrTailCall` and returning XDP_PASS so traffic +/// fails open to kernel slow-path rather than getting blackholed. +#[map] +pub static MUTATION_PROGS: ProgramArray = ProgramArray::with_max_entries(8, 0); + // --- Custom-FIB maps (Option F, Phase 1) ------------------------------- // // These maps are declared and sized in Phase 1 but neither read nor diff --git a/crates/modules/fast-path/src/lib.rs b/crates/modules/fast-path/src/lib.rs index 6d88656..10bc4ea 100644 --- a/crates/modules/fast-path/src/lib.rs +++ b/crates/modules/fast-path/src/lib.rs @@ -29,7 +29,8 @@ pub mod reconcile; #[cfg(target_os = "linux")] pub use linux_impl::{ - fib_status_from_pin, stats_from_pin, trial_attach_native, FibStatusSnapshot, TrialResult, + fib_status_from_pin, stats_from_pin, tail_call_chain_from_pin, trial_attach_native, + FibStatusSnapshot, TrialResult, }; pub const MODULE_NAME: &str = "fast-path"; diff --git a/crates/modules/fast-path/src/linux_impl.rs b/crates/modules/fast-path/src/linux_impl.rs index 1e027e4..d44e312 100644 --- a/crates/modules/fast-path/src/linux_impl.rs +++ b/crates/modules/fast-path/src/linux_impl.rs @@ -657,6 +657,29 @@ fn populate_mss_clamp(ebpf: &mut Ebpf, mcfg: &ModuleConfig<'_>) -> ModuleResult< } pub fn attach(state: &mut ActiveState, cfg: &ModuleConfig<'_>) -> ModuleResult> { + // v0.2.5: load `finalize` first so its FD is available for the + // MUTATION_PROGS[0] population below. Order matters: fast_path's + // tail_call into MUTATION_PROGS[0] must succeed on every packet + // from the moment fast_path is attached, so finalize has to be + // loaded + populated *before* the per-iface attach loop below. 
+ { + let finalize_prog: &mut Xdp = state + .ebpf + .program_mut(pin::FINALIZE_PROGRAM_NAME) + .ok_or_else(|| ModuleError::other(MODULE_NAME, "finalize program missing from ELF"))? + .try_into() + .map_err(|e| { + ModuleError::other(MODULE_NAME, format!("finalize program not XDP: {e}")) + })?; + finalize_prog.load().map_err(|e| { + ModuleError::other( + MODULE_NAME, + format!("Xdp::load(finalize) failed (verifier rejection?): {e}"), + ) + })?; + } + populate_mutation_progs(&mut state.ebpf)?; + let prog: &mut Xdp = state .ebpf .program_mut("fast_path") @@ -1170,18 +1193,26 @@ fn read_iface_driver(iface: &str) -> Option { } fn pin_program_and_maps(state: &mut ActiveState) -> ModuleResult<()> { - let prog_path = pin::program_path(&state.bpffs_root); - { + // v0.2.5: pin both `fast_path` (the iface-attached XDP) and + // `finalize` (the tail-called second stage). Both pins survive + // SIGTERM per SPEC §8.5; on restart, `pin::has_existing_pins` + // sees both and refuses to start until operator runs `detach --all`. + for prog_name in [pin::PROGRAM_NAME, pin::FINALIZE_PROGRAM_NAME] { + let prog_path = pin::program_path_for(&state.bpffs_root, prog_name); let prog: &mut Xdp = state .ebpf - .program_mut(pin::PROGRAM_NAME) - .ok_or_else(|| ModuleError::other(MODULE_NAME, "fast_path program missing for pin"))? + .program_mut(prog_name) + .ok_or_else(|| { + ModuleError::other(MODULE_NAME, format!("{prog_name} program missing for pin")) + })? 
.try_into() - .map_err(|e| ModuleError::other(MODULE_NAME, format!("pin: program not XDP: {e}")))?; + .map_err(|e| { + ModuleError::other(MODULE_NAME, format!("pin: {prog_name} not XDP: {e}")) + })?; prog.pin(&prog_path).map_err(|e| { ModuleError::other( MODULE_NAME, - format!("pin program at {}: {e}", prog_path.display()), + format!("pin {prog_name} at {}: {e}", prog_path.display()), ) })?; } @@ -1201,11 +1232,50 @@ fn pin_program_and_maps(state: &mut ActiveState) -> ModuleResult<()> { info!( pin_root = %pin::module_root(&state.bpffs_root).display(), - "program + maps pinned" + "fast_path + finalize programs + maps pinned" ); Ok(()) } +/// Populate `MUTATION_PROGS[0]` with `finalize`'s FD so fast_path's +/// `bpf_tail_call(MUTATION_PROGS, 0)` resolves to it. Must run after +/// `finalize.load()` (FD valid) and before `fast_path` attaches to any +/// iface (otherwise an early packet would tail-call into an empty slot, +/// trip ErrTailCall, and slow-path through the kernel). +/// +/// v0.2.5+. Single program in slot 0 for now; future stages either +/// replace slot 0 with a chain head or add to subsequent slots. +fn populate_mutation_progs(ebpf: &mut Ebpf) -> ModuleResult<()> { + use aya::maps::ProgramArray; + use aya::programs::ProgramFd; + + // Borrow scope: the ProgramFd has to outlive the ProgramArray::set + // call, but it borrows from `ebpf`. Open the ProgramFd first, then + // reborrow ebpf for the map. + let finalize_fd: ProgramFd = { + let prog: &Xdp = ebpf + .program(pin::FINALIZE_PROGRAM_NAME) + .ok_or_else(|| ModuleError::other(MODULE_NAME, "finalize program missing post-load"))? + .try_into() + .map_err(|e| ModuleError::other(MODULE_NAME, format!("finalize not XDP: {e}")))?; + prog.fd() + .map_err(|e| ModuleError::other(MODULE_NAME, format!("finalize fd: {e}")))? + .try_clone() + .map_err(|e| ModuleError::other(MODULE_NAME, format!("finalize fd clone: {e}")))? 
+ }; + + let map = ebpf + .map_mut("MUTATION_PROGS") + .ok_or_else(|| ModuleError::other(MODULE_NAME, "MUTATION_PROGS map missing"))?; + let mut prog_array: ProgramArray<_> = ProgramArray::try_from(map) + .map_err(|e| ModuleError::other(MODULE_NAME, format!("MUTATION_PROGS try_from: {e}")))?; + prog_array.set(0, &finalize_fd, 0).map_err(|e| { + ModuleError::other(MODULE_NAME, format!("MUTATION_PROGS.set(0, finalize): {e}")) + })?; + info!("MUTATION_PROGS[0] populated with finalize program FD"); + Ok(()) +} + /// §2.3: per-interface trial-attach. `Native` and `Generic` are explicit /// (no fallback); `Auto` tries native first, falls back to generic on /// any error. The spec calls out that `bpftool feature probe` is @@ -1880,13 +1950,12 @@ pub fn stats_from_pin(bpffs_root: &Path) -> ModuleResult> { fn read_stats>( stats: &aya::maps::PerCpuArray, ) -> ModuleResult> { - // `STATS_COUNT` in bpf/src/maps.rs. v0.2.0 = 32 (20 core + 12 - // custom-FIB). v0.2.1 = 33 (added `bogon_dropped` for issue #33). - // Previous versions hardcoded 19 — an off-by-one that hid the - // `err_head_shift` counter from status readback. Keep this in - // lockstep with the BPF side or the last counters show zero - // unfairly. - const STATS_LEN: usize = 33; + // `STATS_COUNT` in bpf/src/maps.rs. Keep in lockstep with the BPF + // side or the last counters show zero unfairly. Prior versions + // hardcoded an off-by-one (19 hid `err_head_shift`; 33 hid + // `mss_clamp_*`); v0.2.5 = 37 (32 + bogon + 2 mss-clamp + 2 + // tail-call diagnostics). + const STATS_LEN: usize = 37; let mut out = vec![0u64; STATS_LEN]; for (idx, slot) in out.iter_mut().enumerate() { let values = stats @@ -1897,6 +1966,43 @@ fn read_stats>( Ok(out) } +/// Read `MUTATION_PROGS` from its bpffs pin and return whether slot 0 +/// is populated. Status command uses this to confirm the v0.2.5+ +/// tail-call chain (`fast_path` → `finalize`) is wired correctly. 
+/// An empty slot means an attach-time bug in `populate_mutation_progs`; +/// fast_path's `tail_call` will fail and bump `ErrTailCall` on every +/// fast-pathed packet. +/// +/// aya 0.13's ProgramArray exposes `indices()` (which keys are set) +/// but not a getter that returns the populated `ProgramFd`/prog_id — +/// the BPF_MAP_TYPE_PROG_ARRAY value is a kernel RawFd that becomes +/// invalid outside the loader's process. We just report populated/ +/// empty here; operators can confirm prog_id via +/// `bpftool prog show name finalize`. +pub fn tail_call_chain_from_pin(bpffs_root: &Path) -> ModuleResult { + use aya::maps::{Map, MapData, ProgramArray}; + + let pin_path = pin::map_path(bpffs_root, "MUTATION_PROGS"); + let map_data = MapData::from_pin(&pin_path).map_err(|e| { + ModuleError::other( + MODULE_NAME, + format!("open MUTATION_PROGS pin at {}: {e}", pin_path.display()), + ) + })?; + let map = Map::ProgramArray(map_data); + let prog_array: ProgramArray<_> = ProgramArray::try_from(map).map_err(|e| { + ModuleError::other(MODULE_NAME, format!("MUTATION_PROGS try_from pin: {e}")) + })?; + for idx in prog_array.indices() { + let key = idx + .map_err(|e| ModuleError::other(MODULE_NAME, format!("MUTATION_PROGS indices: {e}")))?; + if key == 0 { + return Ok(true); + } + } + Ok(false) +} + /// Accessor consumed by the bpffs-pin code in PR #6. For now, /// exposed so the CLI `status` can report the pin root without /// the module needing to expose `ActiveState` directly. diff --git a/crates/modules/fast-path/src/pin.rs b/crates/modules/fast-path/src/pin.rs index b24c1b7..b3cc01a 100644 --- a/crates/modules/fast-path/src/pin.rs +++ b/crates/modules/fast-path/src/pin.rs @@ -22,7 +22,7 @@ use std::path::{Path, PathBuf}; use crate::MODULE_NAME; /// Every §4.5 map that gets pinned. Order is not significant. 
-pub const MAP_NAMES: [&str; 14] = [ +pub const MAP_NAMES: [&str; 19] = [ "ALLOW_V4", "ALLOW_V6", "CFG", @@ -44,11 +44,27 @@ pub const MAP_NAMES: [&str; 14] = [ // pinned for uniform detach. "BLOCK_V4", "BLOCK_V6", + // --- v0.2.4: mss-clamp policy maps --- + "MSS_CLAMP_V4", + "MSS_CLAMP_V6", + "MSS_CLAMP_BY_IFACE", + // --- v0.2.5: two-stage datapath --- + "MUTATION_CTX", + "MUTATION_PROGS", ]; -/// The XDP program's pinned basename. +/// The fast-path XDP program's pinned basename (attached per-iface). pub const PROGRAM_NAME: &str = "fast_path"; +/// The finalize XDP program's pinned basename (tail-called by fast_path +/// via `MUTATION_PROGS[0]`; not directly attached). v0.2.5+. +pub const FINALIZE_PROGRAM_NAME: &str = "finalize"; + +/// All pinned program basenames in this module. Used by detach + +/// `has_existing_pins` to walk every pinned program. Append-only; new +/// programs go at the end. +pub const PROGRAM_NAMES: [&str; 2] = [PROGRAM_NAME, FINALIZE_PROGRAM_NAME]; + pub fn module_root(bpffs_root: &Path) -> PathBuf { bpffs_root.join(MODULE_NAME) } @@ -69,6 +85,13 @@ pub fn program_path(bpffs_root: &Path) -> PathBuf { progs_dir(bpffs_root).join(PROGRAM_NAME) } +/// Path for an arbitrary pinned program by basename. Used by the v0.2.5+ +/// pin lifecycle that walks `PROGRAM_NAMES` to pin both `fast_path` and +/// `finalize`. 
+pub fn program_path_for(bpffs_root: &Path, name: &str) -> PathBuf { + progs_dir(bpffs_root).join(name) +} + pub fn map_path(bpffs_root: &Path, name: &str) -> PathBuf { maps_dir(bpffs_root).join(name) } diff --git a/crates/modules/fast-path/tests/common/mod.rs b/crates/modules/fast-path/tests/common/mod.rs index f60aded..f30ac9c 100644 --- a/crates/modules/fast-path/tests/common/mod.rs +++ b/crates/modules/fast-path/tests/common/mod.rs @@ -23,7 +23,7 @@ use std::os::fd::{AsFd, AsRawFd}; use aya::{ - maps::{lpm_trie::Key as LpmKey, Array, LpmTrie, PerCpuArray}, + maps::{lpm_trie::Key as LpmKey, Array, LpmTrie, PerCpuArray, ProgramArray}, programs::{ProgramFd, Xdp}, Ebpf, Pod, }; @@ -116,18 +116,61 @@ pub struct Harness { } impl Harness { - /// Load + verify the fast-path program. Panics if BPF isn't built - /// or the kernel rejects it. + /// Load + verify both BPF programs (fast_path + finalize, v0.2.5+), + /// and populate `MUTATION_PROGS[0]` so fast_path's `bpf_tail_call` + /// jumps into finalize. Panics if BPF isn't built or the verifier + /// rejects either program. + /// + /// `bpf_prog_test_run` follows tail-calls — the kernel re-enters + /// its BPF dispatcher for the target program, so tests that issue + /// `harness.run(&packet)` see the verdict + mutations from the full + /// chain (fast_path → finalize) when the packet is a successful + /// forward. pub fn new() -> Self { let bytes = aligned_bpf_copy(); let mut bpf = Ebpf::load(&bytes).expect("aya::Ebpf::load"); - let prog: &mut Xdp = bpf - .program_mut("fast_path") - .expect("fast_path program present") - .try_into() - .expect("program is XDP-typed"); - prog.load().expect("verifier accepts program"); + // Load finalize first so its FD is available for the + // MUTATION_PROGS[0] population below. 
+ { + let prog: &mut Xdp = bpf + .program_mut("finalize") + .expect("finalize program present") + .try_into() + .expect("finalize is XDP-typed"); + prog.load().expect("verifier accepts finalize program"); + } + + // Then fast_path. + { + let prog: &mut Xdp = bpf + .program_mut("fast_path") + .expect("fast_path program present") + .try_into() + .expect("program is XDP-typed"); + prog.load().expect("verifier accepts fast_path program"); + } + + // Populate the tail-call jump table. + { + let finalize_fd: ProgramFd = { + let prog: &Xdp = bpf + .program("finalize") + .expect("finalize program present") + .try_into() + .expect("finalize is XDP-typed"); + prog.fd() + .expect("finalize loaded") + .try_clone() + .expect("finalize fd dup") + }; + let map = bpf.map_mut("MUTATION_PROGS").expect("MUTATION_PROGS map"); + let mut prog_array: ProgramArray<_> = + ProgramArray::try_from(map).expect("ProgramArray try_from"); + prog_array + .set(0, &finalize_fd, 0) + .expect("MUTATION_PROGS.set(0, finalize)"); + } // Set a default cfg with dry_run=off and both families enabled. let mut harness = Self { bpf }; diff --git a/docs/runbooks/tail-call-architecture.md b/docs/runbooks/tail-call-architecture.md new file mode 100644 index 0000000..f049f65 --- /dev/null +++ b/docs/runbooks/tail-call-architecture.md @@ -0,0 +1,149 @@ +# Two-stage BPF datapath (v0.2.5+) + +PacketFrame's fast-path runs as **two BPF programs** chained by `bpf_tail_call`. This page exists for operators debugging the chain and contributors planning further BPF work. + +## Why two programs + +The single-program datapath (v0.2.4 and earlier) accumulated mutation, VLAN choreography, and redirect logic in one XDP program. On vanilla 5.15 + 6.6 kernels (CI's qemu test matrix) it loaded fine. On UniFi's `5.15.72-ui-cn9670` (real production hardware), the kernel verifier rejected at: + +``` +combined stack size of 3 calls is 544. 
Too large +stack depth 0+480+0+0 +``` + +UniFi's BPF patches plus the aarch64 JIT account stack ~120 bytes higher than vanilla 5.15 on x86_64 — same bytecode, different verifier accounting. + +Tail-calling into a second program gives that program its own fresh 512-byte stack. Beyond fixing the immediate budget issue, it establishes the pattern for future fast-path stages without re-bisecting stack bytes every time. + +This is **not** the multi-module dispatcher (SPEC §3.4 / §5.0). The dispatcher is for chaining independent modules at the same hook (ddos in front of fast-path, sampler behind it). Tail-call is for splitting one logical pipeline. Both are real and orthogonal; v0.2.5 ships only the latter. + +## Chain topology + +``` + packet ingress + │ + ▼ + ┌──────────────────────────────────────────────────┐ + │ fast_path (XDP, attached to eth0..ethN) │ Frame A + │ classification (allow-prefix, block-prefix) │ fits 512B + │ FIB lookup (kernel-fib | custom-fib | compare)│ + │ devmap pre-check │ + │ TTL decrement │ + │ L2 rewrite (smac/dmac in place) │ + │ write per-CPU MUTATION_CTX │ + │ bpf_tail_call(MUTATION_PROGS, 0) ──────────┐ │ + └────────────────────────────────────────────────│──┘ + │ + ▼ + ┌──────────────────────────────────────────────────┐ + │ finalize (XDP, tail-called by fast_path) │ Frame B + │ read MUTATION_CTX │ fresh 512B + │ mss-clamp lookup + (optional) MSS rewrite │ + │ VLAN choreography (push / pop / rewrite) │ + │ bpf_redirect_map(egress_ifindex) │ + └──────────────────────────────────────────────────┘ + │ + ▼ + egress NIC TX +``` + +The packet itself is preserved across the tail-call — `bpf_tail_call` doesn't touch `xdp_buff`, so any in-place mutations from fast_path (TTL, L2, etc.) carry over. What does NOT carry are the program's local variables, which is why we need a side channel. + +## MUTATION_CTX wire format + +`MUTATION_CTX` is a `PerCpuArray` with a single element. 
`fast_path` writes index 0 immediately before its `bpf_tail_call`; `finalize` reads index 0 as its first action. + +```rust +#[repr(C)] +pub struct MutationCtx { + egress_ifindex: u32, // FIB-resolved egress (pre-VLAN-resolve) + egress_vid: u16, // VLAN_RESOLVE result; 0 = untagged + ingress_vid: u16, // From packet parse; 0 = untagged + ip_offset: u32, // Bytes from ctx.data() to IP header + is_v4: u8, // 1 = IPv4, 0 = IPv6 + _pad: [u8; 3], +} +``` + +16 bytes, naturally aligned. Per-CPU because the NAPI cycle is single-CPU; the read in finalize sees the most recent write in fast_path with no synchronization. + +## MUTATION_PROGS jump table + +`MUTATION_PROGS` is a `ProgramArray` sized for 8 slots. Slot 0 holds `finalize`'s file descriptor. Slots 1–7 are reserved for future stages (see "Adding new stages" below). + +Userspace populates slot 0 at attach time, in `crates/modules/fast-path/src/linux_impl.rs::populate_mutation_progs`. Order is: load `finalize` → populate slot 0 → load + attach `fast_path` to ifaces. If the order is wrong, fast_path's first packet hits an empty slot, `bpf_tail_call` returns an error, and fast_path falls through to `XDP_PASS` (kernel slow-path) while bumping `ErrTailCall`. + +## Diagnostic commands + +```sh +# Confirm both programs are loaded. +sudo bpftool prog show name fast_path +sudo bpftool prog show name finalize + +# Confirm MUTATION_PROGS[0] points at finalize. +sudo bpftool map dump name MUTATION_PROGS +# Expected: key 0x00000000 value + +# packetframe status reports the same: +sudo packetframe status +# tail-call chain (from /sys/fs/bpf/packetframe): +# MUTATION_PROGS[0]: populated (finalize) — confirm prog_id via ... + +# Watch the diagnostic counters: +sudo packetframe status | grep -E 'err_tail_call|err_mutation_ctx' +# Both should be 0 in steady state. + +# Inspect MUTATION_CTX (per-CPU; one entry per CPU): +sudo bpftool map dump name MUTATION_CTX +# Decoded fields are the most recent decision from each CPU's fast-path. 
+# Useful for confirming the chain is firing on real traffic. +``` + +## What `ErrTailCall` and `ErrMutationCtx` mean + +Two new diagnostic counters at indices 35 and 36: + +- `err_tail_call`: fast_path called `MUTATION_PROGS.tail_call(ctx, 0)` and got an error back. Almost always means slot 0 is empty (attach-order bug). fast_path falls through to `XDP_PASS` so traffic still flows via kernel slow-path. +- `err_mutation_ctx`: finalize couldn't read `MUTATION_CTX[0]`. Per-CPU array index 0 is always present, so this should be 0; non-zero indicates a kernel/aya bug worth filing. + +Both are append-only per CLAUDE.md guardrail — operator dashboards keying on counter index keep working. + +## Pin lifecycle + +bpffs layout under `/sys/fs/bpf/packetframe/fast-path/`: + +``` +progs/ +├── fast_path ← attached to ifaces; pin survives SIGTERM +└── finalize ← tail-called; pin survives SIGTERM +maps/ +├── (existing maps: ALLOW_V*, BLOCK_V*, CFG, STATS, ...) +├── MSS_CLAMP_V4 / MSS_CLAMP_V6 / MSS_CLAMP_BY_IFACE (v0.2.4+) +├── MUTATION_CTX (v0.2.5+) +└── MUTATION_PROGS (v0.2.5+) +links/ +└── eth0, eth1, ... ← per-iface XDP attachments (fast_path only) +``` + +`packetframe detach --all` walks both program pins and every map pin. Existing pin lifecycle and SIGTERM-without-detach semantics from SPEC §8.5 apply unchanged: both program pins survive process exit; the bpffs inodes hold kernel references; on restart, `pin::has_existing_pins()` sees them and refuses to start until operator runs `detach --all`. + +## Adding new stages + +The `MUTATION_PROGS` array has room for 7 future stages (slots 1–7). Two patterns: + +**Replace slot 0** if the new stage subsumes finalize's responsibilities. `populate_mutation_progs` decides which program goes in slot 0 based on config. Example: a new `finalize_with_nat` program that does NAT + mss-clamp + VLAN + redirect. + +**Chain via subsequent slots** if the new stage runs *between* finalize-equivalent stages. 
finalize's last action becomes `tail_call(MUTATION_PROGS, 1)` instead of `bpf_redirect_map`; the slot-1 program does redirect. This adds one more 512-byte stack budget. + +In both patterns, all stages share the same `MUTATION_CTX` and `STATS` maps (one ELF, automatic map sharing in aya). New stages can introduce their own scratch maps as needed. + +## What this isn't + +- **Multi-module composition.** ddos / sampler / randomizer (SPEC §5.x) need the libxdp dispatcher, not tail-calls. The dispatcher chains *independent* modules at the same hook based on XDP verdicts; tail-call is one-way control transfer between cooperating stages. +- **A general "anything-goes" tail-call framework.** Tail calls have a depth limit (kernel cap is 33 chained calls; we never approach that) and one-way control flow. They're a tool for stack-budget relief, not a programmability layer. + +## See also + +- [docs/runbooks/mss-clamp.md](mss-clamp.md) — operator guide for the mss-clamp directive (which now lives inside `finalize`) +- [docs/runbooks/reconfigure.md](reconfigure.md) — SIGHUP / `packetframe reconfigure` semantics; both maps update through the same reconcile path regardless of which program reads them +- SPEC.md §3.2 (priority taxonomy), §3.4 (multi-program composition), §4.x (BPF map layouts), §11.x (kernel compatibility notes)