diff --git a/Cargo.lock b/Cargo.lock index 0c668c4..3b64876 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -680,7 +680,7 @@ checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" [[package]] name = "packetframe-cli" -version = "0.2.3" +version = "0.2.4" dependencies = [ "anyhow", "clap", @@ -697,7 +697,7 @@ dependencies = [ [[package]] name = "packetframe-common" -version = "0.2.3" +version = "0.2.4" dependencies = [ "anyhow", "flate2", @@ -710,7 +710,7 @@ dependencies = [ [[package]] name = "packetframe-fast-path" -version = "0.2.3" +version = "0.2.4" dependencies = [ "anyhow", "aya", @@ -733,7 +733,7 @@ dependencies = [ [[package]] name = "packetframe-probe" -version = "0.2.3" +version = "0.2.4" dependencies = [ "anyhow", "aya", diff --git a/Cargo.toml b/Cargo.toml index 09af1b9..2b0f9a3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,7 +23,7 @@ exclude = [ # detach pacing for bridge members, integrity-check Total-line parse # fix). See SPEC.md §11.14 for the rollout history and # `docs/runbooks/custom-fib.md` for operations. -version = "0.2.3" +version = "0.2.4" edition = "2021" # MSRV. Deliberately behind the rust-toolchain.toml pin (which is the # latest stable) so a contributor with a slightly older toolchain still diff --git a/README.md b/README.md index a291f23..cf2bde7 100644 --- a/README.md +++ b/README.md @@ -71,8 +71,13 @@ PacketFrame complements existing routing daemons rather than replacing them. 
The | Connected-destination fast-path (`local-prefix`) | Production (v0.2.1+) | | `fallback-default` synthesis | Production (v0.2.1+) | | `block-prefix` XDP-time drop | Production (v0.2.1+) | +| `mss-clamp` directive (fast-path) | Production (v0.2.4+) | +| `packetframe reconfigure` / `systemctl reload packetframe` | Production (v0.2.4+) | | `probe` module — diagnostic XDP | Production | -| `randomizer` / `ddos` / `sampler` modules | Future — sketched in SPEC, not implemented | +| `ddos` module — XDP-time SYN-flood + amplification filter | Future — sketched in SPEC §5.2 (priority 0–999, security/admission) | +| `sampler` module — per-flow ringbuf observability | Future — sketched in SPEC §5.3 (priority 2000–2999, observation) | +| `randomizer` module — TC egress jitter for NoiseNet anti-correlation | Future — sketched in SPEC §5.1 (priority ~3000, egress) | +| Multi-module dispatcher (prerequisite for any second module on the same hook) | Future — module trait already shaped for it (SPEC §3.2 / §3.4) | ## Install @@ -81,7 +86,7 @@ Releases are published on the [GitHub releases page](https://github.com/unredact ### Debian / Ubuntu (.deb) ```sh -VERSION=v0.2.3 +VERSION=v0.2.4 ARCH=$(dpkg --print-architecture) # amd64 or arm64 curl -LO "https://github.com/unredacted/packetframe/releases/download/${VERSION}/packetframe_${VERSION#v}_${ARCH}.deb" @@ -98,7 +103,7 @@ Installs `/usr/bin/packetframe`, the systemd unit at `/lib/systemd/system/packet For musl-static deployments, non-Debian distros, or anything else: ```sh -VERSION=v0.2.3 +VERSION=v0.2.4 TARGET=aarch64-unknown-linux-gnu # or: x86_64-unknown-linux-{gnu,musl}, aarch64-unknown-linux-musl curl -LO "https://github.com/unredacted/packetframe/releases/download/${VERSION}/packetframe-${VERSION}-${TARGET}.tar.gz" @@ -140,6 +145,9 @@ module fast-path allow-prefix6 2001:db8::/48 dry-run on # observe-only — no redirects yet circuit-breaker drop-ratio 0.01 of matched window 5s threshold 5 + # mss-clamp via eth0 1360 # 
optional — clamp TCP MSS for fast-pathed + # traffic egressing eth0 (closes the + # iptables-bypass MSS gap; v0.2.4+) ``` `dry-run on` makes the program count matched packets but always return `XDP_PASS` — the kernel handles forwarding as if PacketFrame weren't there. Counters tell you whether your allowlist matches the right traffic before you flip the switch. @@ -161,7 +169,14 @@ sudo packetframe status # in another shell — live counters ### 5. Flip dry-run off when match ratios look right -Edit the config, change `dry-run on` to `dry-run off`, then `sudo systemctl reload packetframe` (if running under systemd) or `kill -HUP ` (foreground). The change is delta-only; no detach. +Edit the config, change `dry-run on` to `dry-run off`, then trigger a reload (v0.2.4+): + +```sh +sudo packetframe reconfigure # synchronous; exits non-zero on parse error +sudo systemctl reload packetframe # equivalent under systemd — both end up sending SIGHUP +``` + +What's hot-reloadable: `allow-prefix*`, `block-prefix`, `dry-run`, `forwarding-mode`, `mss-clamp`, VLAN-subif resolution, and the redirect devmap. Attach-set changes (interfaces added/removed), `route-source` config, `circuit-breaker` thresholds, and `local-prefix` still require a full restart. See [docs/runbooks/reconfigure.md](docs/runbooks/reconfigure.md). ### 6. 
Tear down @@ -249,10 +264,16 @@ Quick directive index: - `block-prefix <prefix>` — XDP-time drop for unrouteable destinations - `ecmp-default-hash-mode {3|4|5}` — tuple width for ECMP hashing +**Module fast-path — TCP transforms (v0.2.4+)** +- `mss-clamp <mss>` — global clamp ceiling for matched TCP SYN/SYN-ACK +- `mss-clamp via <iface> <mss>` — per-egress-iface +- `mss-clamp <prefix> <mss>` — per-src-or-dst-prefix (any egress) +- `mss-clamp <prefix> via <iface> <mss>` — most specific (precedence: prefix+iface > prefix > iface > global) + **Module fast-path — driver opt-ins** - `driver-workaround rvu-nicpf-head-shift {auto|on|off}` -`SIGHUP` reloads the config and applies delta-only changes to allowlists, VLAN-resolve, and devmap. Adding or removing an `attach` directive requires a restart. +`SIGHUP` (or `packetframe reconfigure` / `systemctl reload packetframe`) applies delta-only changes to allowlists, block-prefix, VLAN-resolve, devmap, mss-clamp, dry-run, and forwarding-mode bits. Adding or removing an `attach`, changing `route-source`, mutating `circuit-breaker` thresholds, or editing `local-prefix` requires a restart. ## Operator tools diff --git a/VERSION b/VERSION index 7179039..abd4105 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.2.3 +0.2.4 diff --git a/conf/example.conf b/conf/example.conf index a1c9f38..1080b82 100644 --- a/conf/example.conf +++ b/conf/example.conf @@ -45,6 +45,28 @@ module fast-path # 1% of matched over 5 consecutive 5-second samples. circuit-breaker drop-ratio 0.01 of matched window 5s threshold 5 + # MSS clamping for fast-pathed TCP SYN/SYN-ACK packets (v0.2.4+, + # SPEC.md §4.x — closes the §11.4 iptables-bypass gap). Standard + # iptables `-A FORWARD ... TCPMSS --set-mss N` rules don't fire on + # XDP-redirected traffic because bpf_redirect_map skips netfilter; + # this directive runs the equivalent mutation inline before the + # redirect. + # + # Lookup precedence (most specific wins, lower-if-higher policy): + # 1. mss-clamp <prefix> via <iface> <mss> (prefix + egress iface) + # 2. 
mss-clamp <prefix> <mss> (prefix, any egress) + # 3. mss-clamp via <iface> <mss> (egress iface, any prefix) + # 4. mss-clamp <mss> (global default) + # + # Prefix matches src OR dst (mirrors allow-prefix semantics) so a + # single rule covers both directions of a flow. Clamped on both SYN + # and SYN-ACK so each end's announced MSS is constrained per-direction. + # See docs/runbooks/mss-clamp.md for MSS vs MTU math + troubleshooting. + # + # mss-clamp via eth2 1360 # outbound: leaving WAN + # mss-clamp 23.191.201.0/24 via eth2 1360 # outbound, scoped to one customer + # mss-clamp 1360 # global fallback for all matched + # Driver workaround for the pre-Linux-v6.8 rvu-nicpf native XDP bug # (SPEC.md §11.1(c); upstream fix is commit 04f647c8e456). Values: # auto — detect rvu-nicpf via /sys and apply only on native attaches diff --git a/crates/cli/debian/packetframe.service b/crates/cli/debian/packetframe.service index dcba7b3..5b3dd51 100644 --- a/crates/cli/debian/packetframe.service +++ b/crates/cli/debian/packetframe.service @@ -7,6 +7,8 @@ Wants=network-online.target [Service] Type=simple ExecStart=/usr/bin/packetframe run +ExecReload=/bin/kill -HUP $MAINPID +PIDFile=/var/lib/packetframe/state/packetframe.pid Restart=on-failure RestartSec=5 User=root diff --git a/crates/cli/src/loader.rs b/crates/cli/src/loader.rs index 923eb12..67aa852 100644 --- a/crates/cli/src/loader.rs +++ b/crates/cli/src/loader.rs @@ -38,6 +38,57 @@ pub enum RunError { Runtime(String), } +/// Errors from `packetframe reconfigure`. Kept separate from +/// [`RunError`] because the CLI maps each variant to a different exit +/// code + log message — distinguishing "no daemon" from "daemon +/// rejected the new config" matters for operator scripts. Most +/// variants are Linux-only since the underlying signal/PID-file flow +/// is Linux-only; the macOS dev build gates them behind a generic +/// stub. +#[derive(Debug, thiserror::Error)] +pub enum ReconfigureError { + /// Config / pidfile / proc IO error — exit 2 (runtime). 
+ #[error("{0}")] + Io(String), + /// PID file absent or stale — exit 1 (startup-style). + #[cfg_attr(not(target_os = "linux"), allow(dead_code))] + #[error("{0}")] + DaemonNotRunning(String), + /// SIGHUP delivered, daemon ack'd, but the ack reported a parse + /// error or per-module reconcile failure. Exit 2 (runtime). + #[cfg_attr(not(target_os = "linux"), allow(dead_code))] + #[error("{0}")] + DaemonRejected(String), + /// SIGHUP delivered but no ack within 5s. Daemon may be wedged. + #[cfg_attr(not(target_os = "linux"), allow(dead_code))] + #[error("daemon did not acknowledge reconfigure within 5s")] + Timeout, +} + +/// Sub-path under `state-dir` for the PID file. Written by the +/// running `run` loop after attach succeeds; removed on clean exit. +/// systemd's `PIDFile=` directive references the same path so the +/// supervisor has a clean handle (also enables `Type=forking` later +/// without protocol changes). +#[cfg(all(target_os = "linux", feature = "fast-path"))] +const PIDFILE_NAME: &str = "packetframe.pid"; + +/// Sub-path under `state-dir` for the reconfigure ack marker. The +/// daemon writes one line `OK <unix-ns>` after a successful SIGHUP +/// reconcile or `ERR <kind>: <detail> <unix-ns>` on parse / per-module +/// failure. The `packetframe reconfigure` CLI polls this file for +/// up to 5s after sending SIGHUP and exits accordingly. +#[cfg(all(target_os = "linux", feature = "fast-path"))] +const RECONFIGURE_MARKER_NAME: &str = "last-reconfigure.timestamp"; + +/// Polling cadence + timeout for the CLI side of the reconfigure +/// handshake. 5s is plenty: the SIGHUP handler is synchronous and +/// finishes in ~tens of ms (mostly LPM-trie diffs). 
+#[cfg(target_os = "linux")] +const RECONFIGURE_POLL_INTERVAL_MS: u64 = 100; +#[cfg(target_os = "linux")] +const RECONFIGURE_TIMEOUT_MS: u64 = 5_000; + pub fn run(config_path: &Path) -> Result<(), RunError> { let config = Config::from_file(config_path) .map_err(|e| RunError::Startup(format!("config parse: {e}")))?; @@ -180,9 +231,25 @@ fn run_linux(config: Config, config_path: &Path) -> Result<(), RunError> { } } + // Write the PID file now, after attach has fully succeeded and + // the breaker sampler is up. Doing it any earlier would expose + // operators (and systemd's PIDFile=) to a half-attached daemon. + // Clean-exit paths below remove it; an uncontrolled crash leaves + // it stale, which `packetframe reconfigure` detects via the + // /proc//comm cross-check. + let pid_file_path = config.global.state_dir.join(PIDFILE_NAME); + if let Err(e) = write_pid_file(&pid_file_path) { + tracing::warn!( + path = %pid_file_path.display(), + error = %e, + "could not write PID file; `packetframe reconfigure` and `systemctl reload` will not work" + ); + } + tracing::info!("fast-path running — SIGHUP to reconfigure, SIGTERM/SIGINT to exit (§8.5)"); - let termination = drive_signal_loop(config_path, &mut modules).map_err(RunError::Runtime)?; + let termination = drive_signal_loop(config_path, &config.global.state_dir, &mut modules) + .map_err(RunError::Runtime)?; // Stop the exporter + breaker sampler(s) first so their final // writes complete before we touch module state. @@ -215,9 +282,36 @@ fn run_linux(config: Config, config_path: &Path) -> Result<(), RunError> { drop(modules); } } + // Best-effort PID file cleanup. Non-fatal — the file is harmless + // if left behind (PID will be unrecognized on re-validate). 
+ if let Err(e) = std::fs::remove_file(&pid_file_path) { + if e.kind() != std::io::ErrorKind::NotFound { + tracing::warn!( + path = %pid_file_path.display(), + error = %e, + "could not remove PID file on exit" + ); + } + } Ok(()) } +/// Atomically write the current PID to `path`. Uses write-then-rename +/// so a half-written file is never observed. +#[cfg(all(target_os = "linux", feature = "fast-path"))] +fn write_pid_file(path: &Path) -> std::io::Result<()> { + use std::io::Write; + let parent = path.parent().unwrap_or_else(|| Path::new(".")); + std::fs::create_dir_all(parent)?; + let tmp = path.with_extension("pid.tmp"); + { + let mut f = std::fs::File::create(&tmp)?; + writeln!(f, "{}", std::process::id())?; + f.sync_all()?; + } + std::fs::rename(&tmp, path) +} + /// Look through a module section's directives and return its /// `CircuitBreakerSpec`, if present. Multiple directives of the same /// kind aren't rejected by the parser — take the last one if so. @@ -250,6 +344,7 @@ enum Termination { #[cfg(all(target_os = "linux", feature = "fast-path"))] fn drive_signal_loop( config_path: &Path, + state_dir: &Path, modules: &mut [(String, Box)], ) -> Result { use signal_hook::{ @@ -262,7 +357,7 @@ fn drive_signal_loop( for sig in signals.forever() { match sig { - SIGHUP => reconfigure_from_signal(config_path, modules), + SIGHUP => reconfigure_from_signal(config_path, state_dir, modules), SIGTERM | SIGINT => { tracing::info!(signal = sig, "termination requested"); return Ok(Termination::ExitPreserveAttach); @@ -280,24 +375,31 @@ fn drive_signal_loop( /// SIGHUP handler. Re-parses the config from `config_path` and calls /// `Module::reconfigure` on each loaded module. Parse failures and /// per-module reconfigure errors are logged and swallowed — a bad -/// SIGHUP never kills the running data plane. +/// SIGHUP never kills the running data plane. Writes an ack marker +/// to `state_dir/last-reconfigure.timestamp` for the +/// `packetframe reconfigure` CLI to poll. 
#[cfg(all(target_os = "linux", feature = "fast-path"))] fn reconfigure_from_signal( config_path: &Path, + state_dir: &Path, modules: &mut [(String, Box)], ) { use packetframe_common::module::ModuleConfig; tracing::info!(config = %config_path.display(), "SIGHUP received; reconfiguring"); + let marker_path = state_dir.join(RECONFIGURE_MARKER_NAME); + let new_config = match Config::from_file(config_path) { Ok(c) => c, Err(e) => { tracing::error!(error = %e, "SIGHUP config parse failed; keeping current config"); + write_reconfigure_marker(&marker_path, &format!("ERR parse: {e}")); return; } }; + let mut failures: Vec = Vec::new(); for (name, module) in modules.iter_mut() { let section = match new_config.modules.iter().find(|m| &m.name == name) { Some(s) => s, @@ -306,13 +408,182 @@ fn reconfigure_from_signal( module = %name, "module removed from config; reconfigure skipped (attach-set changes require restart)" ); + failures.push(format!("{name}: removed from config (restart required)")); continue; } }; let mcfg = ModuleConfig::new(section, &new_config.global); if let Err(e) = module.reconfigure(&mcfg) { tracing::warn!(module = %name, error = %e, "reconfigure failed"); + failures.push(format!("{name}: {e}")); + } + } + + if failures.is_empty() { + write_reconfigure_marker(&marker_path, "OK"); + } else { + write_reconfigure_marker( + &marker_path, + &format!("ERR module: {}", failures.join("; ")), + ); + } +} + +/// Append a timestamp + status line to the reconfigure marker file. +/// Non-fatal on I/O error — the SIGHUP handler still completed its +/// real work; the marker is just a hint to the CLI ack-poller. 
+#[cfg(all(target_os = "linux", feature = "fast-path"))] +fn write_reconfigure_marker(path: &Path, status: &str) { + use std::io::Write; + let now_ns = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_nanos()) + .unwrap_or(0); + let parent = path.parent().unwrap_or_else(|| Path::new(".")); + if let Err(e) = std::fs::create_dir_all(parent) { + tracing::warn!(error = %e, "could not create reconfigure marker dir"); + return; + } + let tmp = path.with_extension("timestamp.tmp"); + let body = format!("{status} {now_ns}\n"); + let r = (|| -> std::io::Result<()> { + let mut f = std::fs::File::create(&tmp)?; + f.write_all(body.as_bytes())?; + f.sync_all()?; + std::fs::rename(&tmp, path) + })(); + if let Err(e) = r { + tracing::warn!(error = %e, "could not write reconfigure marker"); + } +} + +/// CLI entry for `packetframe reconfigure `. Reads the +/// daemon's PID file from the configured `state-dir`, validates the +/// running process, sends SIGHUP, and polls the ack-marker for up to +/// 5s. See [`ReconfigureError`] for the failure axes. +#[cfg(target_os = "linux")] +pub fn reconfigure(config_path: &Path) -> Result<(), ReconfigureError> { + let config = Config::from_file(config_path) + .map_err(|e| ReconfigureError::Io(format!("config parse: {e}")))?; + let state_dir = &config.global.state_dir; + let pid_path = state_dir.join(PIDFILE_NAME); + let marker_path = state_dir.join(RECONFIGURE_MARKER_NAME); + + let pid = read_pid_file(&pid_path).map_err(|e| match e.kind() { + std::io::ErrorKind::NotFound => ReconfigureError::DaemonNotRunning(format!( + "PID file not found at {} — daemon doesn't appear to be running", + pid_path.display() + )), + _ => ReconfigureError::Io(format!("read PID file {}: {e}", pid_path.display())), + })?; + + // /proc//comm cross-check defends against a stale PID file + // pointing at a recycled PID. The kernel truncates `comm` to 15 + // chars + NUL; "packetframe" fits comfortably. 
+ if !proc_comm_matches(pid, "packetframe") { + return Err(ReconfigureError::DaemonNotRunning(format!( + "PID {pid} from {} is not a packetframe process (stale pidfile?)", + pid_path.display() + ))); + } + + // Snapshot the marker mtime (or NotFound) before signaling so we + // can detect "changed since SIGHUP." + let pre_mtime = marker_mtime(&marker_path); + + // SIGHUP. The daemon's signal loop picks it up synchronously and + // either reconciles or logs+writes ERR. + let rc = unsafe { libc::kill(pid, libc::SIGHUP) }; + if rc != 0 { + return Err(ReconfigureError::Io(format!( + "kill -HUP {pid}: {}", + std::io::Error::last_os_error() + ))); + } + + // Poll for up to 5s. + let start = std::time::Instant::now(); + let timeout = std::time::Duration::from_millis(RECONFIGURE_TIMEOUT_MS); + let interval = std::time::Duration::from_millis(RECONFIGURE_POLL_INTERVAL_MS); + loop { + let now_mtime = marker_mtime(&marker_path); + if now_mtime != pre_mtime && now_mtime.is_some() { + // The daemon ack'd. Read the body to distinguish OK from + // a parse-error or per-module reconcile failure. + return match std::fs::read_to_string(&marker_path) { + Ok(body) => parse_reconfigure_marker(&body), + Err(e) => Err(ReconfigureError::Io(format!( + "read marker {}: {e}", + marker_path.display() + ))), + }; } + if start.elapsed() >= timeout { + return Err(ReconfigureError::Timeout); + } + std::thread::sleep(interval); + } +} + +/// Non-Linux stub. The daemon can't actually run on non-Linux hosts +/// (XDP is Linux-only), so reconfigure has nothing to talk to. 
+#[cfg(not(target_os = "linux"))] +pub fn reconfigure(_config_path: &Path) -> Result<(), ReconfigureError> { + Err(ReconfigureError::Io( + "reconfigure is Linux-only — the daemon cannot run on this host".into(), + )) +} + +#[cfg(target_os = "linux")] +fn read_pid_file(path: &Path) -> std::io::Result { + let s = std::fs::read_to_string(path)?; + s.trim().parse::().map_err(|e| { + std::io::Error::new( + std::io::ErrorKind::InvalidData, + format!("PID parse `{}`: {e}", s.trim()), + ) + }) +} + +/// Read /proc/<pid>/comm and check that it matches `expected`. +/// Returns false on any I/O error or mismatch — caller treats as +/// "PID is not our process." +#[cfg(target_os = "linux")] +fn proc_comm_matches(pid: libc::pid_t, expected: &str) -> bool { + let path = format!("/proc/{pid}/comm"); + match std::fs::read_to_string(&path) { + Ok(s) => s.trim() == expected, + Err(_) => false, + } +} + +/// Modified time of the marker file, in (secs, nanos). `None` if the +/// file doesn't exist — used to detect "freshly written since +/// SIGHUP." Any non-NotFound error is treated as "no observation," +/// which causes the poller to keep waiting until timeout. +#[cfg(target_os = "linux")] +fn marker_mtime(path: &Path) -> Option<(i64, u32)> { + let meta = std::fs::metadata(path).ok()?; + let m = meta.modified().ok()?; + let dur = m.duration_since(std::time::UNIX_EPOCH).ok()?; + Some((dur.as_secs() as i64, dur.subsec_nanos())) +} + +/// Parse the marker body — `OK <unix-ns>` or `ERR <kind>: <detail> <unix-ns>`. +#[cfg(target_os = "linux")] +fn parse_reconfigure_marker(body: &str) -> Result<(), ReconfigureError> { + let trimmed = body.trim(); + if let Some(rest) = trimmed.strip_prefix("OK ") { + // Rest is just the timestamp; we don't use it. + let _ = rest; + Ok(()) + } else if let Some(rest) = trimmed.strip_prefix("ERR ") { + Err(ReconfigureError::DaemonRejected(rest.to_string())) + } else { + // Marker exists but doesn't match the expected format. 
+ Err(ReconfigureError::Io(format!( + "unexpected marker content: {trimmed}" + ))) } } diff --git a/crates/cli/src/main.rs b/crates/cli/src/main.rs index ab2a00f..1069fa3 100644 --- a/crates/cli/src/main.rs +++ b/crates/cli/src/main.rs @@ -106,10 +106,19 @@ enum Command { config: Option, }, - /// Re-read config and reconcile. Stubbed until PR #6. + /// Re-read config and apply hot-reloadable changes (allow-prefix, + /// block-prefix, dry-run, forwarding-mode, mss-clamp, etc.) on the + /// running daemon without touching attach state. Reads the + /// daemon's PID file, sends SIGHUP, then polls the + /// `last-reconfigure.timestamp` marker for confirmation. + /// Equivalent to `systemctl reload packetframe` under systemd. + /// Attach-set changes (interfaces added/removed) require a full + /// restart and are silently skipped — `packetframe status` after + /// the reload shows what's currently attached. Reconfigure { /// Path to the config file. Defaults to - /// `/etc/packetframe/packetframe.conf`. + /// `/etc/packetframe/packetframe.conf`. Used to discover the + /// daemon's `state-dir` (and from there, the PID file). #[arg(long)] config: Option, }, @@ -274,7 +283,31 @@ fn main() -> ExitCode { } } } - Command::Reconfigure { .. 
} => not_implemented("reconfigure"), + Command::Reconfigure { config } => { + let path = config_path_or_default(config); + match loader::reconfigure(&path) { + Ok(()) => ExitCode::from(EXIT_OK), + Err(loader::ReconfigureError::Io(msg)) => { + tracing::error!(error = %msg, "reconfigure failed"); + ExitCode::from(EXIT_RUNTIME_ERROR) + } + Err(loader::ReconfigureError::DaemonNotRunning(msg)) => { + tracing::error!(error = %msg, "reconfigure: daemon not running"); + ExitCode::from(EXIT_STARTUP_ERROR) + } + Err(loader::ReconfigureError::DaemonRejected(msg)) => { + tracing::error!(error = %msg, "reconfigure: daemon rejected the new config"); + ExitCode::from(EXIT_RUNTIME_ERROR) + } + Err(loader::ReconfigureError::Timeout) => { + tracing::error!( + "reconfigure: no acknowledgment from the daemon within 5s — \ + daemon may be wedged. Check `journalctl -u packetframe`." + ); + ExitCode::from(EXIT_RUNTIME_ERROR) + } + } + } Command::Map { .. } => not_implemented("map"), #[cfg(all(target_os = "linux", feature = "fast-path"))] Command::Fib { op } => fib_cli::run(op), diff --git a/crates/common/src/config.rs b/crates/common/src/config.rs index 0cce884..05698f1 100644 --- a/crates/common/src/config.rs +++ b/crates/common/src/config.rs @@ -205,6 +205,28 @@ pub enum ModuleDirective { cidr: Ipv4Prefix, line: usize, }, + /// MSS clamping for matched TCP SYN/SYN-ACK packets (v0.2.4+). + /// Closes the SPEC §11.4 gap where iptables `TCPMSS` rules don't + /// fire on fast-pathed flows because XDP redirect bypasses + /// netfilter. Four grammars: + /// + /// - `mss-clamp ` — global default for all matched TCP SYNs + /// - `mss-clamp via ` — per-egress-iface + /// - `mss-clamp ` — per-src-or-dst-prefix (any egress) + /// - `mss-clamp via ` — most specific + /// + /// Lookup precedence at XDP runtime, most specific wins: + /// `(prefix + iface)` then `prefix` then `iface` then `global`. + /// Prefix matches on src OR dst (mirrors `allow-prefix`). 
+ /// Lower-if-higher policy — only rewrites when the SYN's existing + /// MSS is greater than the configured clamp (matches iptables + /// `TCPMSS --set-mss` semantics). + MssClamp { + prefix: Option, + iface: Option, + mss: u16, + line: usize, + }, DryRun(bool), CircuitBreaker(CircuitBreakerSpec), /// Operator override for a driver-specific workaround. Currently @@ -237,6 +259,16 @@ pub enum ModuleDirective { EcmpDefaultHashMode(EcmpHashMode), } +/// One side of the [`ModuleDirective::MssClamp`] discriminator — +/// either an IPv4 or IPv6 prefix. Userspace dispatches on this when +/// populating the `MSS_CLAMP_V4` / `MSS_CLAMP_V6` LPM tries. +#[derive(Debug, Clone, Copy, Serialize, PartialEq, Eq)] +#[serde(rename_all = "lowercase", tag = "family", content = "cidr")] +pub enum MssClampPrefix { + V4(Ipv4Prefix), + V6(Ipv6Prefix), +} + /// Forwarding-path selector. `KernelFib` keeps today's behavior — /// bpf_fib_lookup() and the legacy success path. `CustomFib` routes /// through the Option-F LPM trie + nexthop cache. `Compare` runs @@ -889,6 +921,100 @@ fn parse_module_directive(line: usize, s: &str) -> Result { + // v0.2.4+ — four grammars accepted: + // mss-clamp + // mss-clamp via + // mss-clamp + // mss-clamp via + // + // Disambiguation: a token containing `/` (CIDR delimiter) + // is treated as a prefix; the literal `via` introduces an + // egress-iface scope; otherwise the token is the MSS + // value. SPEC §4.x. 
+ let tok1 = rest.next().ok_or_else(|| { + ConfigError::parse( + line, + "mss-clamp requires at least an MTU value \ + (form: `mss-clamp [] [via ] `)", + ) + })?; + let mut prefix: Option = None; + let mut iface: Option = None; + let mss_tok: &str; + + if tok1 == "via" { + // mss-clamp via + let iface_tok = rest.next().ok_or_else(|| { + ConfigError::parse(line, "mss-clamp: expected an iface name after `via`") + })?; + iface = Some(iface_tok.to_string()); + mss_tok = rest.next().ok_or_else(|| { + ConfigError::parse(line, "mss-clamp: expected an MTU value after the iface") + })?; + } else if tok1.contains('/') { + // mss-clamp [via ] + if let Ok(p) = tok1.parse::() { + prefix = Some(MssClampPrefix::V4(p)); + } else if let Ok(p) = tok1.parse::() { + prefix = Some(MssClampPrefix::V6(p)); + } else { + return Err(ConfigError::parse( + line, + format!( + "mss-clamp: cannot parse `{tok1}` as IPv4 or IPv6 CIDR \ + (form: `mss-clamp [] [via ] `)" + ), + )); + } + let next_tok = rest.next().ok_or_else(|| { + ConfigError::parse(line, "mss-clamp: expected `via ` or an MTU value") + })?; + if next_tok == "via" { + let iface_tok = rest.next().ok_or_else(|| { + ConfigError::parse(line, "mss-clamp: expected an iface name after `via`") + })?; + iface = Some(iface_tok.to_string()); + mss_tok = rest.next().ok_or_else(|| { + ConfigError::parse(line, "mss-clamp: expected an MTU value after the iface") + })?; + } else { + mss_tok = next_tok; + } + } else { + // mss-clamp + mss_tok = tok1; + } + + if rest.next().is_some() { + return Err(ConfigError::parse( + line, + "mss-clamp: too many arguments \ + (form: `mss-clamp [] [via ] `)", + )); + } + + let mss: u16 = mss_tok.parse().map_err(|e| { + ConfigError::parse(line, format!("mss-clamp: bad MTU `{mss_tok}`: {e}")) + })?; + // 88 = TCP/IP minimum (RFC 879/1122 — 40-byte v4+TCP + // header on a 128-byte frame, less than 88 starts breaking + // assumptions). 65495 = max ethernet payload minus a v4 + + // TCP header (65535 - 40). 
Outside this range is almost + // certainly a config typo. + if !(88..=65495).contains(&mss) { + return Err(ConfigError::parse( + line, + format!("mss-clamp: MTU {mss} out of range [88, 65495]"), + )); + } + Ok(ModuleDirective::MssClamp { + prefix, + iface, + mss, + line, + }) + } "dry-run" => { let v = rest .next() @@ -2203,4 +2329,146 @@ module fast-path let e = parse_module_body(" block-prefix 10.0.0.0/8 something\n").unwrap_err(); assert!(matches!(e, ConfigError::Parse { .. })); } + + // --- mss-clamp tests (v0.2.4+) ---------------------------------- + + fn extract_mss_clamps(m: &ModuleSection) -> Vec<(Option, Option, u16)> { + m.directives + .iter() + .filter_map(|d| match d { + ModuleDirective::MssClamp { + prefix, iface, mss, .. + } => Some((*prefix, iface.clone(), *mss)), + _ => None, + }) + .collect() + } + + #[test] + fn mss_clamp_global_form() { + let m = parse_module_body(" mss-clamp 1360\n").expect("parse"); + let v = extract_mss_clamps(&m); + assert_eq!(v.len(), 1); + assert_eq!(v[0], (None, None, 1360)); + } + + #[test] + fn mss_clamp_per_iface_form() { + let m = parse_module_body(" mss-clamp via eth2 1400\n").expect("parse"); + let v = extract_mss_clamps(&m); + assert_eq!(v.len(), 1); + assert_eq!(v[0].0, None); + assert_eq!(v[0].1.as_deref(), Some("eth2")); + assert_eq!(v[0].2, 1400); + } + + #[test] + fn mss_clamp_per_prefix_v4() { + let m = parse_module_body(" mss-clamp 23.191.201.0/24 1280\n").expect("parse"); + let v = extract_mss_clamps(&m); + assert_eq!(v.len(), 1); + match v[0].0.as_ref().unwrap() { + MssClampPrefix::V4(p) => { + assert_eq!(p.addr.octets(), [23, 191, 201, 0]); + assert_eq!(p.prefix_len, 24); + } + other => panic!("expected V4, got {other:?}"), + } + assert_eq!(v[0].1, None); + assert_eq!(v[0].2, 1280); + } + + #[test] + fn mss_clamp_per_prefix_v6() { + let m = parse_module_body(" mss-clamp 2001:db8::/48 1280\n").expect("parse"); + let v = extract_mss_clamps(&m); + assert_eq!(v.len(), 1); + match v[0].0.as_ref().unwrap() { + 
MssClampPrefix::V6(p) => assert_eq!(p.prefix_len, 48), + other => panic!("expected V6, got {other:?}"), + } + } + + #[test] + fn mss_clamp_prefix_plus_iface() { + let m = parse_module_body(" mss-clamp 23.191.201.0/24 via eth2 1280\n").expect("parse"); + let v = extract_mss_clamps(&m); + assert_eq!(v.len(), 1); + assert!(matches!(v[0].0, Some(MssClampPrefix::V4(_)))); + assert_eq!(v[0].1.as_deref(), Some("eth2")); + assert_eq!(v[0].2, 1280); + } + + #[test] + fn mss_clamp_multiple_lines_accumulate() { + let body = " mss-clamp 1360\n\ + mss-clamp via eth2 1400\n\ + mss-clamp 23.191.201.0/24 1280\n"; + let m = parse_module_body(body).expect("parse"); + assert_eq!(extract_mss_clamps(&m).len(), 3); + } + + #[test] + fn mss_clamp_missing_value_errors() { + let e = parse_module_body(" mss-clamp\n").unwrap_err(); + assert!(matches!(e, ConfigError::Parse { .. })); + } + + #[test] + fn mss_clamp_missing_iface_after_via_errors() { + let e = parse_module_body(" mss-clamp via\n").unwrap_err(); + assert!(matches!(e, ConfigError::Parse { .. })); + } + + #[test] + fn mss_clamp_missing_value_after_iface_errors() { + let e = parse_module_body(" mss-clamp via eth2\n").unwrap_err(); + assert!(matches!(e, ConfigError::Parse { .. })); + } + + #[test] + fn mss_clamp_value_below_minimum_errors() { + // 87 is below the 88 floor. + let e = parse_module_body(" mss-clamp 87\n").unwrap_err(); + let msg = format!("{e}"); + assert!(msg.contains("out of range"), "got: {msg}"); + } + + #[test] + fn mss_clamp_value_above_maximum_errors() { + // 65496 is above the 65495 ceiling. + let e = parse_module_body(" mss-clamp 65496\n").unwrap_err(); + let msg = format!("{e}"); + assert!(msg.contains("out of range"), "got: {msg}"); + } + + #[test] + fn mss_clamp_ip_without_cidr_errors() { + // `10.0.0.0` (no `/` slash) → parser treats it as the MSS + // value position, then sees `1360` as an unexpected extra + // arg. The error message isn't pretty but the directive is + // rejected, which is what matters. 
+ let e = parse_module_body(" mss-clamp 10.0.0.0 1360\n").unwrap_err(); + assert!(matches!(e, ConfigError::Parse { .. }), "got: {e:?}"); + } + + #[test] + fn mss_clamp_extra_arg_errors() { + let e = parse_module_body(" mss-clamp 1360 extra\n").unwrap_err(); + assert!(matches!(e, ConfigError::Parse { .. })); + } + + #[test] + fn mss_clamp_value_at_minimum_accepted() { + let m = parse_module_body(" mss-clamp 88\n").expect("parse"); + let v = extract_mss_clamps(&m); + assert_eq!(v[0].2, 88); + } + + #[test] + fn mss_clamp_value_at_maximum_accepted() { + let m = parse_module_body(" mss-clamp 65495\n").expect("parse"); + let v = extract_mss_clamps(&m); + assert_eq!(v[0].2, 65495); + } } diff --git a/crates/modules/fast-path/bpf/src/main.rs b/crates/modules/fast-path/bpf/src/main.rs index de8b30f..39fd065 100644 --- a/crates/modules/fast-path/bpf/src/main.rs +++ b/crates/modules/fast-path/bpf/src/main.rs @@ -35,8 +35,8 @@ mod maps; use maps::{ bump_stat, StatIdx, ALLOW_V4, ALLOW_V6, BLOCK_V4, BLOCK_V6, CFG, FP_CFG_FLAG_COMPARE_MODE, - FP_CFG_FLAG_CUSTOM_FIB, - FP_CFG_FLAG_HEAD_SHIFT_128, REDIRECT_DEVMAP, VLAN_RESOLVE, + FP_CFG_FLAG_CUSTOM_FIB, FP_CFG_FLAG_HEAD_SHIFT_128, MSS_CLAMP_BY_IFACE, MSS_CLAMP_V4, + MSS_CLAMP_V6, REDIRECT_DEVMAP, VLAN_RESOLVE, }; const AF_INET: u8 = 2; @@ -453,6 +453,16 @@ fn forward_success( return Ok(xdp_action::XDP_PASS); } + // MSS clamping (v0.2.4+, closes SPEC §11.4 gap). Mutate the TCP + // MSS option in SYN/SYN-ACK packets before they're handed to the + // egress NIC — must happen before `apply_vlan_egress` (which can + // shift packet bytes via `bpf_xdp_adjust_head`) but is order- + // independent w.r.t. TTL decrement and L2 rewrite (those edit + // existing bytes in place). No-op for non-TCP, non-SYN packets, + // or when no clamp policy applies. Skipped under `is_dry_run()` + // because dry-run returns XDP_PASS earlier in the flow. 
+ mss_clamp_inline(ctx, ip, is_v4, egress_ifindex); + // TTL/hop_limit + csum first — IP header's position in memory // doesn't change with adjust_head, only its offset from `data`. if is_v4 { @@ -659,6 +669,260 @@ fn vlan_rewrite(ctx: &XdpContext, vid: u16) -> Result<(), ()> { Ok(()) } +// --- MSS clamping (v0.2.4+, SPEC §4.x — closes §11.4 gap) ----------------- + +/// SYN flag in TCP byte 13 (low byte of `doff_flags` over the wire). +const TCP_FLAG_SYN: u8 = 0x02; + +/// Walk the TCP-options block of a matched SYN/SYN-ACK and mutate the +/// MSS option in place if a clamp policy applies and the existing MSS +/// is greater than the clamp value. Recomputes the TCP checksum +/// incrementally (RFC 1624). Bumps `MssClampApplied` on rewrite and +/// `MssClampSkipped` on "policy applies but no rewrite needed" — i.e. +/// existing MSS already ≤ clamp, no MSS option present, or malformed +/// options walked past before finding it. +/// +/// Bounds-checked at every read against `ctx.data_end()`. The options +/// loop is fixed-bound at 8 iterations (a 40-iteration walk exceeded +/// the verifier's instruction limit), so an MSS option past the eighth +/// option is left unclamped and counted as `MssClampSkipped`. +/// +/// Marked `#[inline(always)]` deliberately. Two earlier attempts to +/// split this into a subprogram (for stack budget) ran into the BPF +/// kernel verifier rejecting the bpf2bpf calling convention LLVM +/// emits: even when arguments are scalar, LLVM SROA decomposes +/// `&XdpContext` into `(data, data_end)` packet pointers, and the +/// verifier prohibits pointer-shift instructions on packet pointers +/// (the lift LLVM emits to extend 32-bit→64-bit). Inlining is the +/// only verifier-friendly option for code that touches the packet.
+/// +/// Stack trim: each LPM key is block-scoped so the compiler can +/// reuse the same stack slot for src and dst keys rather than +/// holding both live; lookup helpers are also `#[inline(always)]` +/// for the same reason; src/dst addresses are read inside their +/// respective LPM blocks rather than at the function top. +#[inline(always)] +fn mss_clamp_inline(ctx: &XdpContext, ip: *mut u8, is_v4: bool, egress_ifindex: u32) { + let start = ctx.data(); + let end = ctx.data_end(); + + // Read protocol byte first — bail early on non-TCP, which is + // the overwhelmingly common case for fast-pathed traffic. + let proto = if is_v4 { + unsafe { (*(ip as *const Ipv4Hdr)).proto } + } else { + unsafe { (*(ip as *const Ipv6Hdr)).next_hdr } + }; + if proto != PROTO_TCP { + return; + } + + // Look up clamp value via the precedence chain. Returns 0 if no + // policy applies. Helper is `#[inline(always)]`; its locals share + // this function's frame and are block-scoped for slot reuse. + let clamp = if is_v4 { + lookup_mss_clamp_v4(ip as *const Ipv4Hdr, egress_ifindex) + } else { + lookup_mss_clamp_v6(ip as *const Ipv6Hdr, egress_ifindex) + }; + if clamp == 0 { + return; + } + + // Recover the IP-header offset so we can compute the TCP offset + // (and bounds-check) without holding `ip` as a separate pointer + // variable. ip - start is a scalar (pkt_a - pkt_b) per the + // verifier. + let ip_offset = (ip as usize) - start; + let tcp_offset = ip_offset + if is_v4 { Ipv4Hdr::LEN } else { Ipv6Hdr::LEN }; + + // Need 20 bytes for the fixed TCP header before walking options. + if start + tcp_offset + 20 > end { + return; + } + + // Bytes 12-13 of TCP header: data_offset:4 | reserved:4 | flags:8. + // doff is in 32-bit words; valid range [5, 15] = [20, 60] bytes. 
+ let doff_byte = unsafe { *((start + tcp_offset + 12) as *const u8) }; + let flags = unsafe { *((start + tcp_offset + 13) as *const u8) }; + if flags & TCP_FLAG_SYN == 0 { + return; // Not SYN/SYN-ACK; clamp doesn't apply. + } + let doff_words = (doff_byte >> 4) as usize; + if !(5..=15).contains(&doff_words) { + return; + } + let tcp_hdr_len = doff_words * 4; + let opts_len = tcp_hdr_len - 20; + if opts_len == 0 { + // SYN with no options — operator policy says "clamp" but + // there's no MSS field to mutate. Count as skipped. + bump_stat(StatIdx::MssClampSkipped); + return; + } + if start + tcp_offset + tcp_hdr_len > end { + return; + } + + // Walk options. Cap at 8 iterations: real SYN packets put MSS in + // the first 1-4 options, and 8 is plenty of headroom while + // keeping the BPF verifier's state-exploration bounded. A 40- + // iteration walk hit the verifier's 1M-instruction processing + // limit due to combinatorial state explosion across the branches. + // + // Use sequential `if` checks rather than `match` for the same + // reason — fewer state-space splits per iteration. + let opts_start_off = tcp_offset + 20; + let mut cursor: usize = 0; + let mut found = false; + + for _ in 0..8 { + if cursor >= opts_len { + break; + } + let p_addr = start + opts_start_off + cursor; + // Need at least 4 bytes for a worst-case MSS option; if + // there's less than that left, no MSS is possible. Also + // bounds-checks the kind/length reads below. + if p_addr + 4 > end { + break; + } + let p = p_addr as *const u8; + let kind = unsafe { *p }; + if kind == 0 { + break; // EOL — no more options. + } + if kind == 1 { + cursor += 1; // NOP, single byte. + continue; + } + // Length-prefixed option (kind != 0, != 1). Length includes + // the kind+length bytes themselves; valid range is 2..=opts_len. + let length = unsafe { *p.add(1) } as usize; + if length < 2 || cursor + length > opts_len { + break; // Malformed. 
+ } + if kind == 2 && length == 4 { + // MSS option: [kind=2, length=4, mss_be:2]. + let mss_be = unsafe { [*p.add(2), *p.add(3)] }; + let mss = u16::from_be_bytes(mss_be); + if mss > clamp { + let new_mss_be = clamp.to_be_bytes(); + unsafe { + let pmut = p as *mut u8; + *pmut.add(2) = new_mss_be[0]; + *pmut.add(3) = new_mss_be[1]; + } + // TCP csum is at offset 16 of the TCP header; do an + // RFC 1624 incremental update. + let csum_off = tcp_offset + 16; + if start + csum_off + 2 > end { + return; + } + let csum_p = (start + csum_off) as *mut u8; + let old_csum_be = unsafe { [*csum_p, *csum_p.add(1)] }; + let old_csum = u16::from_be_bytes(old_csum_be); + let new_csum = csum_replace_u16(old_csum, mss, clamp); + let new_csum_be = new_csum.to_be_bytes(); + unsafe { + *csum_p = new_csum_be[0]; + *csum_p.add(1) = new_csum_be[1]; + } + bump_stat(StatIdx::MssClampApplied); + } else { + bump_stat(StatIdx::MssClampSkipped); + } + found = true; + break; + } + cursor += length; + } + + if !found { + // Hit EOL or walked past the budget without an MSS option. + bump_stat(StatIdx::MssClampSkipped); + } +} + +/// Apply RFC 1624 incremental checksum update for a single 16-bit +/// field change: `HC' = ~(~HC + ~m + m')`. Two-iteration end-around +/// carry fold (max 2 needed for adding three 16-bit values into a +/// u32). Verifier-friendly — no loops. +#[inline(always)] +fn csum_replace_u16(old_csum: u16, old_val: u16, new_val: u16) -> u16 { + let mut sum: u32 = (!old_csum) as u32 + (!old_val) as u32 + new_val as u32; + sum = (sum & 0xffff) + (sum >> 16); + sum = (sum & 0xffff) + (sum >> 16); + !(sum as u16) +} + +/// Resolve the mss-clamp value for an IPv4 packet, in precedence +/// order: src-prefix → dst-prefix → per-egress → global. Returns 0 if +/// no policy applies. The LPM lookups respect each entry's +/// `iface_filter` (0 = wildcard). 
Block-scope each Key + addr so LLVM +/// can reuse the same stack slot rather than carrying both keys live +/// — matters for the cumulative BPF 512-byte stack budget. Reads +/// addresses through the IP-header pointer rather than taking them +/// by value so the caller doesn't pre-materialize them on its frame. +#[inline(always)] +fn lookup_mss_clamp_v4(ip: *const Ipv4Hdr, egress_ifindex: u32) -> u16 { + { + let key = Key::new(32, unsafe { (*ip).src_addr }); + if let Some(entry) = MSS_CLAMP_V4.get(&key) { + if entry.iface_filter == 0 || entry.iface_filter == egress_ifindex { + return entry.mss; + } + } + } + { + let key = Key::new(32, unsafe { (*ip).dst_addr }); + if let Some(entry) = MSS_CLAMP_V4.get(&key) { + if entry.iface_filter == 0 || entry.iface_filter == egress_ifindex { + return entry.mss; + } + } + } + if let Some(mss) = unsafe { MSS_CLAMP_BY_IFACE.get(&egress_ifindex) } { + if *mss != 0 { + return *mss; + } + } + if let Some(c) = CFG.get(0) { + return c.mss_clamp_global; + } + 0 +} + +/// IPv6 mirror of [`lookup_mss_clamp_v4`] — same precedence, /128 keys. 
+#[inline(always)] +fn lookup_mss_clamp_v6(ip: *const Ipv6Hdr, egress_ifindex: u32) -> u16 { + { + let key = Key::new(128, unsafe { (*ip).src_addr }); + if let Some(entry) = MSS_CLAMP_V6.get(&key) { + if entry.iface_filter == 0 || entry.iface_filter == egress_ifindex { + return entry.mss; + } + } + } + { + let key = Key::new(128, unsafe { (*ip).dst_addr }); + if let Some(entry) = MSS_CLAMP_V6.get(&key) { + if entry.iface_filter == 0 || entry.iface_filter == egress_ifindex { + return entry.mss; + } + } + } + if let Some(mss) = unsafe { MSS_CLAMP_BY_IFACE.get(&egress_ifindex) } { + if *mss != 0 { + return *mss; + } + } + if let Some(c) = CFG.get(0) { + return c.mss_clamp_global; + } + 0 +} + // --- TTL / csum / helpers ------------------------------------------------- /// Decrement IPv4 TTL and patch the header checksum using RFC 1624 diff --git a/crates/modules/fast-path/bpf/src/maps.rs b/crates/modules/fast-path/bpf/src/maps.rs index f012d2a..4e9c6fe 100644 --- a/crates/modules/fast-path/bpf/src/maps.rs +++ b/crates/modules/fast-path/bpf/src/maps.rs @@ -14,7 +14,7 @@ use aya_ebpf::{ /// Runtime flags poked by userspace via the `cfg` map. `version` is a /// reserved byte carved out now so future fields can be added without /// breaking userspace reads of older-layout BPF objects (SPEC §4.5 note: -/// the `fp_cfg` struct has `...` — we enumerate exactly the v0.1 fields +/// the `fp_cfg` struct has `...` — we enumerate exactly the fields /// plus a version discriminator). #[repr(C)] #[derive(Copy, Clone)] @@ -24,21 +24,25 @@ pub struct FpCfg { /// Bit 0 = IPv4 enabled, bit 1 = IPv6 enabled. 0 disables both /// (pure dry-run passthrough). Reserved bits must be zero. pub flags: u8, - pub _reserved: [u8; 2], - /// Layout version. `0` = v0.1 layout (this file). Userspace rejects - /// loads if this doesn't match what it expects. + /// Global MSS clamp value (native u16). 
0 = unset; per-prefix and + /// per-iface lookups in `MSS_CLAMP_V{4,6}` and `MSS_CLAMP_BY_IFACE` + /// take precedence over this fallback. v0.2.4+ (SPEC §4.x). + /// Slot was `_reserved` in v0.1; repurposed in layout V2. + pub mss_clamp_global: u16, + /// Layout version. `1` = v0.2.4 layout (this file). Userspace + /// rejects loads if this doesn't match what it expects. pub version: u32, } impl FpCfg { - pub const VERSION_V1: u32 = 0; + pub const VERSION_V2: u32 = 1; pub const fn zeroed() -> Self { Self { dry_run: 0, flags: 0b11, - _reserved: [0; 2], - version: Self::VERSION_V1, + mss_clamp_global: 0, + version: Self::VERSION_V2, } } } @@ -119,11 +123,22 @@ pub enum StatIdx { /// Diagnostic counter so operators can see which/how many flows /// the bogon-block is catching. BogonDropped = 32, + /// v0.2.4: matched TCP SYN/SYN-ACK whose MSS option was rewritten + /// down to the configured clamp value (SPEC §4.x — closes §11.4 + /// gap). Bumped per packet, not per flow. + MssClampApplied = 33, + /// v0.2.4: matched TCP SYN/SYN-ACK that hit a clamp lookup but was + /// NOT rewritten — either the existing MSS option was already ≤ the + /// clamp value (no-op), there was no MSS option in the SYN, or the + /// TCP-options walk hit its bounded cap before finding one. Useful + /// to gauge how often clamps are firing vs being skipped on existing + /// well-behaved traffic. + MssClampSkipped = 34, } /// Total counter count. Used as `stats` map `max_entries`. New counters /// bump this; dashboards keying on indices keep working. -pub const STATS_COUNT: u32 = 33; +pub const STATS_COUNT: u32 = 35; /// Flag bits for `FpCfg.flags`. Bits 0-1 are the IPv4/IPv6 enable /// mask (historical, load-bearing for dashboards). Bit 2 is the @@ -164,6 +179,15 @@ const LOG_RINGBUF_BYTES: u32 = 256 * 1024; /// VIDs 1/66/88/99/1337 + 3996..4040 — ~50. 256 is headroom. const VLAN_RESOLVE_MAX_ENTRIES: u32 = 256; +/// Max prefixes per mss-clamp LPM trie. 
Sized like the allowlist; each +/// `mss-clamp <prefix> [via <iface>] <mss>` directive becomes one entry. +const MSS_CLAMP_PREFIX_MAX_ENTRIES: u32 = 1024; + +/// Max per-egress mss-clamp entries. Sized like the redirect devmap — +/// one entry per attached egress iface that has a `mss-clamp via X N` +/// rule. 64 is comfortable for any non-container deployment. +const MSS_CLAMP_IFACE_MAX_ENTRIES: u32 = 64; + // --- Custom-FIB map sizes (Option F) ----------------------------------- // // Sized to accommodate the full IPv4 + IPv6 BGP tables plus headroom. @@ -207,6 +231,27 @@ pub struct VlanResolve { pub _pad: u16, } +/// Value stored in the mss-clamp LPM tries. `mss` is a native u16 +/// clamp ceiling; `iface_filter` is an optional egress ifindex +/// constraint (0 = wildcard, matches any egress). `#[repr(C)]` with +/// explicit padding so userspace layout matches byte-for-byte. +/// +/// Why a struct rather than a bare u16: lets a single LPM lookup +/// answer both "prefix only" and "prefix + via iface" queries — the +/// XDP path checks `iface_filter == 0 || iface_filter == egress_ifindex` +/// without needing two separate tries. Userspace splats one entry per +/// `mss-clamp <prefix> [via <iface>] <mss>` directive. +#[repr(C)] +#[derive(Copy, Clone)] +pub struct MssClampValue { + /// Clamp ceiling, native u16 (host endian). Comparisons with the + /// wire MSS option value happen after a `from_be_bytes` conversion. + pub mss: u16, + pub _pad: u16, + /// Egress ifindex required for this rule to apply. 0 = match any. + pub iface_filter: u32, +} + // --- Custom FIB value layouts (Option F) ------------------------------- /// Max nexthops in a single ECMP group. The XDP program walks `nh_idx` @@ -384,6 +429,29 @@ pub static REDIRECT_DEVMAP: DevMapHash = pub static VLAN_RESOLVE: HashMap = HashMap::with_max_entries(VLAN_RESOLVE_MAX_ENTRIES, 0); +/// IPv4 mss-clamp policy by src-or-dst prefix (v0.2.4+). XDP looks up +/// the packet's src first, then dst, mirroring `allow-prefix` semantics.
+/// Each hit's `iface_filter` is checked against the resolved egress +/// ifindex (0 = wildcard). Empty by default; populated only when the +/// operator declares `mss-clamp <prefix> [via <iface>] <mss>` lines. +#[map] +pub static MSS_CLAMP_V4: LpmTrie<[u8; 4], MssClampValue> = + LpmTrie::with_max_entries(MSS_CLAMP_PREFIX_MAX_ENTRIES, 0); + +/// IPv6 mss-clamp policy by src-or-dst prefix. +#[map] +pub static MSS_CLAMP_V6: LpmTrie<[u8; 16], MssClampValue> = + LpmTrie::with_max_entries(MSS_CLAMP_PREFIX_MAX_ENTRIES, 0); + +/// Per-egress mss-clamp value (v0.2.4+). Keyed by FIB-resolved egress +/// ifindex (pre-VLAN-resolve, so the operator's `mss-clamp via eth2.66` +/// keys on the subif ifindex if that's what FIB returned). Value is +/// the clamp ceiling as a native u16. Hash-keyed so high ifindex values +/// don't waste memory. +#[map] +pub static MSS_CLAMP_BY_IFACE: HashMap = + HashMap::with_max_entries(MSS_CLAMP_IFACE_MAX_ENTRIES, 0); + // --- Custom-FIB maps (Option F, Phase 1) ------------------------------- // // These maps are declared and sized in Phase 1 but neither read nor diff --git a/crates/modules/fast-path/build.rs b/crates/modules/fast-path/build.rs index a09e75f..61cbd0f 100644 --- a/crates/modules/fast-path/build.rs +++ b/crates/modules/fast-path/build.rs @@ -178,7 +178,11 @@ fn main() { } Ok(out) => { // stdout is cargo JSON messages with --message-format=json; - // only stderr is human-readable and worth forwarding. + // on FAILURE we parse out the `rendered` field from each + // compiler-message so the actual rustc diagnostic text + // surfaces in cargo's warning stream. stderr carries the + // human-readable progress + summary.
+ forward_rendered_diagnostics(&out.stdout); forward_output(&[], &out.stderr); let msg = format!( "BPF build failed (exit {}) — see cargo:warning lines above for the real error, or run `(cd crates/modules/fast-path/bpf && cargo build --release)` directly", @@ -257,3 +261,71 @@ fn forward_output(stdout: &[u8], stderr: &[u8]) { fn write_stub(path: &std::path::Path) { std::fs::write(path, []).expect("write stub ELF"); } + +/// Walk cargo's `--message-format=json` stdout, find each +/// `"reason":"compiler-message"` line, and forward its rendered +/// diagnostic body. This is what makes BPF compile errors visible in +/// the parent cargo's warning stream — they're emitted as JSON on +/// stdout (not stderr), so without this they're invisible when +/// `forward_output(&[], ...)` is called. +fn forward_rendered_diagnostics(stdout: &[u8]) { + const REASON_MARKER: &str = "\"reason\":\"compiler-message\""; + const RENDERED_MARKER: &str = "\"rendered\":\""; + for line in String::from_utf8_lossy(stdout).lines() { + if !line.contains(REASON_MARKER) { + continue; + } + let Some(start) = line.find(RENDERED_MARKER) else { + continue; + }; + let rest = &line[start + RENDERED_MARKER.len()..]; + // Find the closing quote, handling backslash-escaped quotes + // by walking the string char-by-char. + let mut end = None; + let mut chars = rest.char_indices(); + while let Some((idx, c)) = chars.next() { + if c == '\\' { + // skip the escaped char + chars.next(); + continue; + } + if c == '"' { + end = Some(idx); + break; + } + } + let Some(end_idx) = end else { + continue; + }; + let rendered = &rest[..end_idx]; + // Unescape the JSON string. Only `\n`, `\t`, `\"`, `\\` + // appear in rustc rendered output. 
+ let mut out = String::with_capacity(rendered.len()); + let mut esc = false; + for c in rendered.chars() { + if esc { + match c { + 'n' => out.push('\n'), + 't' => out.push('\t'), + 'r' => out.push('\r'), + '"' => out.push('"'), + '\\' => out.push('\\'), + other => { + out.push('\\'); + out.push(other); + } + } + esc = false; + } else if c == '\\' { + esc = true; + } else { + out.push(c); + } + } + for line in out.lines() { + if !line.trim().is_empty() { + println!("cargo::warning=[bpf rustc] {line}"); + } + } + } +} diff --git a/crates/modules/fast-path/src/linux_impl.rs b/crates/modules/fast-path/src/linux_impl.rs index 7c823f7..1e027e4 100644 --- a/crates/modules/fast-path/src/linux_impl.rs +++ b/crates/modules/fast-path/src/linux_impl.rs @@ -31,23 +31,27 @@ use crate::{aligned_bpf_copy, pin, FAST_PATH_BPF_AVAILABLE, MODULE_NAME}; /// with all-bit-patterns-valid primitive fields, so `aya::Pod` is safe /// to impl — the marker tells aya the struct is safe to byte-copy into /// the kernel's map buffer. Bytes-for-bytes match the BPF-side struct. +/// +/// Layout V2 (v0.2.4): the formerly-`_reserved [u8; 2]` slot is now +/// `mss_clamp_global: u16` — a global MSS clamp ceiling for matched +/// TCP SYN/SYN-ACK packets. 0 = unset. Per-prefix and per-iface clamp +/// maps take precedence over this fallback. #[repr(C)] #[derive(Copy, Clone, Debug)] pub struct FpCfg { pub dry_run: u8, pub flags: u8, - pub _reserved: [u8; 2], + pub mss_clamp_global: u16, pub version: u32, } -// SAFETY: FpCfg is repr(C), contains only primitive integer types and -// a fixed-size byte array — every bit pattern is a valid FpCfg. No -// padding that could leak uninitialized memory (u8/u8/[u8;2]/u32 packs -// exactly into 8 bytes on every target). Aya uses this to byte-copy -// the struct into the kernel's array value slot. +// SAFETY: FpCfg is repr(C), contains only primitive integer types +// (u8/u8/u16/u32 packs exactly into 8 bytes on every target). 
Every +// bit pattern is valid; aya uses this to byte-copy the struct into +// the kernel's array value slot. unsafe impl aya::Pod for FpCfg {} -pub(crate) const FP_CFG_VERSION_V1: u32 = 0; +pub(crate) const FP_CFG_VERSION_V2: u32 = 1; /// Mirror of `bpf/src/maps.rs::FP_CFG_FLAG_HEAD_SHIFT_128`. Enables /// the pre-Linux-v6.8 rvu-nicpf `xdp_prepare_buff` workaround (SPEC @@ -106,6 +110,23 @@ pub struct VlanResolve { // SAFETY: repr(C), all primitive fields, every bit pattern valid. unsafe impl aya::Pod for VlanResolve {} +/// Layout mirror of `MssClampValue` in `bpf/src/maps.rs`. Value type +/// for the `MSS_CLAMP_V4` / `MSS_CLAMP_V6` LPM tries. `#[repr(C)]` +/// with explicit padding so the userspace and BPF layouts match +/// byte-for-byte (8 bytes total). +#[repr(C)] +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub struct MssClampValue { + pub mss: u16, + pub _pad: u16, + /// Egress ifindex required for this rule. 0 = match any. + pub iface_filter: u32, +} + +// SAFETY: repr(C), all primitive fields, no internal padding leaks +// (u16/u16/u32 packs exactly into 8 bytes). +unsafe impl aya::Pod for MssClampValue {} + /// All state required to keep the attached program alive. /// /// After `attach`, the XDP program, every §4.5 map, and each per-iface @@ -201,6 +222,7 @@ pub fn load(cfg: &ModuleConfig<'_>, ctx: &LoaderCtx<'_>) -> ModuleResult) -> ModuleResult<()> { .unwrap_or_default(); let fib_flags = fib_flags_from_forwarding_mode(forwarding); + // Global mss-clamp value from `mss-clamp ` (no prefix, no + // iface). Per-prefix and per-iface clamps live in their own maps + // populated by `populate_mss_clamp` and take precedence over this + // CFG fallback. 0 = unset. 
+ let mss_clamp_global = mss_clamp_global_value(&mcfg.section.directives).unwrap_or(0); + let fp_cfg = FpCfg { dry_run: u8::from(dry_run), // bits 0-1: IPv4/IPv6 enabled (historical, load-bearing for @@ -262,8 +290,8 @@ fn populate_cfg(ebpf: &mut Ebpf, mcfg: &ModuleConfig<'_>) -> ModuleResult<()> { // bit 2 (HEAD_SHIFT_128) is OR'd on later in // apply_driver_quirks_cfg for rvu-nicpf attaches. flags: 0b11 | fib_flags, - _reserved: [0; 2], - version: FP_CFG_VERSION_V1, + mss_clamp_global, + version: FP_CFG_VERSION_V2, }; let map = ebpf @@ -469,6 +497,165 @@ fn prefix_contains(outer: &Ipv4Prefix, inner: &Ipv4Prefix) -> bool { (u32::from(outer.addr) & mask) == (u32::from(inner.addr) & mask) } +/// Extract the global `mss-clamp <mss>` directive (the form with no +/// prefix and no `via <iface>` qualifier). Returns the MSS value or +/// `None` if no such directive exists. Shared between `populate_cfg` +/// (initial load) and `reconcile::reconcile_cfg` (SIGHUP) so both +/// agree on which directive is "the global one." v0.2.4+. +pub(crate) fn mss_clamp_global_value(directives: &[ModuleDirective]) -> Option { + directives.iter().find_map(|d| match d { + ModuleDirective::MssClamp { + prefix: None, + iface: None, + mss, + .. + } => Some(*mss), + _ => None, + }) +} + +/// Populate `MSS_CLAMP_V4`, `MSS_CLAMP_V6`, and `MSS_CLAMP_BY_IFACE` +/// from any `mss-clamp` directives in the module config. The global +/// (`mss-clamp <mss>`) form is handled in `populate_cfg` via the +/// `FpCfg.mss_clamp_global` field; this function handles the three +/// scoped forms — per-prefix, per-iface, and per-prefix-+-iface. +/// Empty / no directives → all maps stay empty (LPM lookups miss +/// cheaply, no per-packet overhead). +fn populate_mss_clamp(ebpf: &mut Ebpf, mcfg: &ModuleConfig<'_>) -> ModuleResult<()> { + use packetframe_common::config::MssClampPrefix; + + // Collect (prefix, iface_filter, mss) triples, splitting by family.
+ let mut v4_entries: Vec<([u8; 4], u8, u32, u16)> = Vec::new(); + let mut v6_entries: Vec<([u8; 16], u8, u32, u16)> = Vec::new(); + let mut iface_entries: Vec<(String, u16)> = Vec::new(); + + for d in &mcfg.section.directives { + let ModuleDirective::MssClamp { + prefix, + iface, + mss, + line, + } = d + else { + continue; + }; + // Resolve `via ` to ifindex if present. Missing ifaces + // are fatal at load — the operator declared a clamp on an + // iface that doesn't exist, which is almost certainly a typo. + let iface_filter: u32 = match iface { + Some(name) => if_nametoindex(name).map_err(|e| { + ModuleError::other( + MODULE_NAME, + format!("mss-clamp at line {line}: iface `{name}` lookup failed: {e}",), + ) + })?, + None => 0, + }; + match (prefix, iface) { + (Some(MssClampPrefix::V4(p)), _) => { + v4_entries.push((p.addr.octets(), p.prefix_len, iface_filter, *mss)); + } + (Some(MssClampPrefix::V6(p)), _) => { + v6_entries.push((p.addr.octets(), p.prefix_len, iface_filter, *mss)); + } + (None, Some(name)) => { + iface_entries.push((name.clone(), *mss)); + } + (None, None) => { + // Global — handled by populate_cfg; skip here. + } + } + } + + // IPv4 LPM trie. + if !v4_entries.is_empty() { + let map = ebpf + .map_mut("MSS_CLAMP_V4") + .ok_or_else(|| ModuleError::other(MODULE_NAME, "MSS_CLAMP_V4 map missing from ELF"))?; + let mut trie: LpmTrie<_, [u8; 4], MssClampValue> = LpmTrie::try_from(map) + .map_err(|e| ModuleError::other(MODULE_NAME, format!("MSS_CLAMP_V4 try_from: {e}")))?; + for (addr, plen, iface_filter, mss) in &v4_entries { + let key = LpmKey::new(u32::from(*plen), *addr); + let value = MssClampValue { + mss: *mss, + _pad: 0, + iface_filter: *iface_filter, + }; + trie.insert(&key, value, 0).map_err(|e| { + ModuleError::other( + MODULE_NAME, + format!( + "MSS_CLAMP_V4 insert {}/{}: {e}", + std::net::Ipv4Addr::from(*addr), + plen + ), + ) + })?; + } + } + + // IPv6 LPM trie. 
+ if !v6_entries.is_empty() { + let map = ebpf + .map_mut("MSS_CLAMP_V6") + .ok_or_else(|| ModuleError::other(MODULE_NAME, "MSS_CLAMP_V6 map missing from ELF"))?; + let mut trie: LpmTrie<_, [u8; 16], MssClampValue> = LpmTrie::try_from(map) + .map_err(|e| ModuleError::other(MODULE_NAME, format!("MSS_CLAMP_V6 try_from: {e}")))?; + for (addr, plen, iface_filter, mss) in &v6_entries { + let key = LpmKey::new(u32::from(*plen), *addr); + let value = MssClampValue { + mss: *mss, + _pad: 0, + iface_filter: *iface_filter, + }; + trie.insert(&key, value, 0).map_err(|e| { + ModuleError::other( + MODULE_NAME, + format!( + "MSS_CLAMP_V6 insert {}/{}: {e}", + std::net::Ipv6Addr::from(*addr), + plen + ), + ) + })?; + } + } + + // Per-iface table. + if !iface_entries.is_empty() { + let map = ebpf.map_mut("MSS_CLAMP_BY_IFACE").ok_or_else(|| { + ModuleError::other(MODULE_NAME, "MSS_CLAMP_BY_IFACE map missing from ELF") + })?; + let mut hm: AyaHashMap<_, u32, u16> = AyaHashMap::try_from(map).map_err(|e| { + ModuleError::other(MODULE_NAME, format!("MSS_CLAMP_BY_IFACE try_from: {e}")) + })?; + for (name, mss) in &iface_entries { + let ifindex = if_nametoindex(name).map_err(|e| { + ModuleError::other( + MODULE_NAME, + format!("mss-clamp via {name}: ifindex lookup failed: {e}"), + ) + })?; + hm.insert(ifindex, mss, 0).map_err(|e| { + ModuleError::other( + MODULE_NAME, + format!("MSS_CLAMP_BY_IFACE insert {name}({ifindex}): {e}"), + ) + })?; + } + } + + if !v4_entries.is_empty() || !v6_entries.is_empty() || !iface_entries.is_empty() { + info!( + v4_count = v4_entries.len(), + v6_count = v6_entries.len(), + iface_count = iface_entries.len(), + "mss-clamp policy populated" + ); + } + Ok(()) +} + pub fn attach(state: &mut ActiveState, cfg: &ModuleConfig<'_>) -> ModuleResult> { let prog: &mut Xdp = state .ebpf diff --git a/crates/modules/fast-path/src/reconcile.rs b/crates/modules/fast-path/src/reconcile.rs index 1557def..9c62e0d 100644 --- a/crates/modules/fast-path/src/reconcile.rs +++ 
b/crates/modules/fast-path/src/reconcile.rs @@ -23,8 +23,8 @@ use packetframe_common::{ use tracing::{info, warn}; use crate::linux_impl::{ - fib_flags_from_forwarding_mode, if_nametoindex, read_vlan_config, ActiveState, FpCfg, - VlanResolve, FP_CFG_FLAG_HEAD_SHIFT_128, FP_CFG_VERSION_V1, + fib_flags_from_forwarding_mode, if_nametoindex, mss_clamp_global_value, read_vlan_config, + ActiveState, FpCfg, MssClampValue, VlanResolve, FP_CFG_FLAG_HEAD_SHIFT_128, FP_CFG_VERSION_V2, }; use crate::MODULE_NAME; @@ -39,6 +39,9 @@ pub fn reconcile(state: &mut ActiveState, cfg: &ModuleConfig<'_>) -> ModuleResul reconcile_cfg(state, cfg)?; let v4 = reconcile_allow_v4(state, cfg)?; let v6 = reconcile_allow_v6(state, cfg)?; + let block_v4 = reconcile_block_v4(state, cfg)?; + let block_v6 = reconcile_block_v6(state, cfg)?; + let mss_clamp = reconcile_mss_clamp(state, cfg)?; let vlan = reconcile_vlan_resolve(state)?; let devmap = reconcile_devmap(state)?; @@ -47,6 +50,16 @@ pub fn reconcile(state: &mut ActiveState, cfg: &ModuleConfig<'_>) -> ModuleResul v4_removed = v4.removed, v6_added = v6.added, v6_removed = v6.removed, + block_v4_added = block_v4.added, + block_v4_removed = block_v4.removed, + block_v6_added = block_v6.added, + block_v6_removed = block_v6.removed, + mss_v4_added = mss_clamp.0.added, + mss_v4_removed = mss_clamp.0.removed, + mss_v6_added = mss_clamp.1.added, + mss_v6_removed = mss_clamp.1.removed, + mss_iface_added = mss_clamp.2.added, + mss_iface_removed = mss_clamp.2.removed, vlan_added = vlan.added, vlan_removed = vlan.removed, devmap_added = devmap.added, @@ -94,11 +107,13 @@ fn reconcile_cfg(state: &mut ActiveState, cfg: &ModuleConfig<'_>) -> ModuleResul .map_err(|e| ModuleError::other(MODULE_NAME, format!("CFG get: {e}")))?; let head_shift = current.flags & FP_CFG_FLAG_HEAD_SHIFT_128; + let mss_clamp_global = mss_clamp_global_value(&cfg.section.directives).unwrap_or(0); + let new_cfg = FpCfg { dry_run: u8::from(dry_run), flags: 0b11 | head_shift | 
fib_flags_from_forwarding_mode(forwarding), - _reserved: [0; 2], - version: FP_CFG_VERSION_V1, + mss_clamp_global, + version: FP_CFG_VERSION_V2, }; cfg_arr @@ -323,3 +338,258 @@ fn ifindex_exists(ifindex: u32) -> bool { let c = unsafe { std::ffi::CStr::from_ptr(ptr) }; !c.to_bytes().is_empty() } + +// --- v0.2.4 additions: block-prefix + mss-clamp reconcile ----------------- + +/// IPv4 block-prefix delta. Mirrors `reconcile_allow_v4` against the +/// `BLOCK_V4` LPM trie. Closes the v0.2.1 gap where adding/removing +/// `block-prefix` lines required a full restart. +fn reconcile_block_v4(state: &mut ActiveState, cfg: &ModuleConfig<'_>) -> ModuleResult { + let desired: HashSet<(u32, [u8; 4])> = cfg + .section + .directives + .iter() + .filter_map(|d| match d { + ModuleDirective::BlockPrefix { cidr, .. } => { + Some((u32::from(cidr.prefix_len), cidr.addr.octets())) + } + _ => None, + }) + .collect(); + + let map = state + .ebpf + .map_mut("BLOCK_V4") + .ok_or_else(|| ModuleError::other(MODULE_NAME, "BLOCK_V4 map missing from ELF"))?; + let mut trie: LpmTrie<_, [u8; 4], u8> = LpmTrie::try_from(map) + .map_err(|e| ModuleError::other(MODULE_NAME, format!("BLOCK_V4 try_from: {e}")))?; + + let current: HashSet<(u32, [u8; 4])> = trie + .keys() + .filter_map(Result::ok) + .map(|k| (k.prefix_len(), k.data())) + .collect(); + + apply_prefix_delta::<[u8; 4]>(&mut trie, &desired, ¤t, "BLOCK_V4") +} + +/// IPv6 block-prefix delta. Mirrors `reconcile_allow_v6`. Currently +/// the BPF-side `BLOCK_V6` is consulted but the v0.2.1 grammar only +/// has `block-prefix` for IPv4 (no `block-prefix6` directive); this +/// path always converges to "remove anything left over" until the v6 +/// directive lands. Cheap to keep wired up so the reconcile flow is +/// symmetric. 
+fn reconcile_block_v6(
+    state: &mut ActiveState,
+    _cfg: &ModuleConfig<'_>,
+) -> ModuleResult<DeltaCount> {
+    let desired: HashSet<(u32, [u8; 16])> = HashSet::new();
+
+    let map = state
+        .ebpf
+        .map_mut("BLOCK_V6")
+        .ok_or_else(|| ModuleError::other(MODULE_NAME, "BLOCK_V6 map missing from ELF"))?;
+    let mut trie: LpmTrie<_, [u8; 16], u8> = LpmTrie::try_from(map)
+        .map_err(|e| ModuleError::other(MODULE_NAME, format!("BLOCK_V6 try_from: {e}")))?;
+
+    let current: HashSet<(u32, [u8; 16])> = trie
+        .keys()
+        .filter_map(Result::ok)
+        .map(|k| (k.prefix_len(), k.data()))
+        .collect();
+
+    apply_prefix_delta::<[u8; 16]>(&mut trie, &desired, &current, "BLOCK_V6")
+}
+
+/// MSS-clamp delta across the three maps: `MSS_CLAMP_V4`, `MSS_CLAMP_V6`,
+/// `MSS_CLAMP_BY_IFACE`. Returns three [`DeltaCount`]s for the caller's
+/// log line. The value-bearing tries use re-insert-on-key semantics
+/// (any change to `mss` or `iface_filter` for an existing prefix
+/// re-writes the entry), then drop keys absent from the desired set.
+/// The global `mss-clamp <mss>` form is updated via `reconcile_cfg`,
+/// not here.
+#[allow(clippy::type_complexity)]
+fn reconcile_mss_clamp(
+    state: &mut ActiveState,
+    cfg: &ModuleConfig<'_>,
+) -> ModuleResult<(DeltaCount, DeltaCount, DeltaCount)> {
+    use packetframe_common::config::MssClampPrefix;
+
+    // Build desired sets keyed by prefix; values include both mss and
+    // iface_filter so we can detect changed-MSS-on-same-prefix.
+    let mut desired_v4: std::collections::HashMap<(u32, [u8; 4]), MssClampValue> =
+        std::collections::HashMap::new();
+    let mut desired_v6: std::collections::HashMap<(u32, [u8; 16]), MssClampValue> =
+        std::collections::HashMap::new();
+    let mut desired_iface: std::collections::HashMap<u32, u16> = std::collections::HashMap::new();
+
+    for d in &cfg.section.directives {
+        let ModuleDirective::MssClamp {
+            prefix, iface, mss, ..
+        } = d
+        else {
+            continue;
+        };
+        let iface_filter: u32 = match iface {
+            Some(name) => match if_nametoindex(name) {
+                Ok(idx) => idx,
+                Err(e) => {
+                    warn!(iface = %name, error = %e, "mss-clamp reconcile: iface lookup failed; skipping rule");
+                    continue;
+                }
+            },
+            None => 0,
+        };
+        match (prefix, iface) {
+            (Some(MssClampPrefix::V4(p)), _) => {
+                desired_v4.insert(
+                    (u32::from(p.prefix_len), p.addr.octets()),
+                    MssClampValue {
+                        mss: *mss,
+                        _pad: 0,
+                        iface_filter,
+                    },
+                );
+            }
+            (Some(MssClampPrefix::V6(p)), _) => {
+                desired_v6.insert(
+                    (u32::from(p.prefix_len), p.addr.octets()),
+                    MssClampValue {
+                        mss: *mss,
+                        _pad: 0,
+                        iface_filter,
+                    },
+                );
+            }
+            (None, Some(_)) => {
+                desired_iface.insert(iface_filter, *mss);
+            }
+            (None, None) => {
+                // Global — handled by reconcile_cfg.
+            }
+        }
+    }
+
+    let v4_delta = reconcile_mss_lpm::<[u8; 4]>(state, &desired_v4, "MSS_CLAMP_V4")?;
+    let v6_delta = reconcile_mss_lpm::<[u8; 16]>(state, &desired_v6, "MSS_CLAMP_V6")?;
+    let iface_delta = reconcile_mss_iface(state, &desired_iface)?;
+
+    Ok((v4_delta, v6_delta, iface_delta))
+}
+
+/// Generic LPM-trie reconcile for the mss-clamp value type. Differs
+/// from `apply_prefix_delta` because the value (`MssClampValue`) is
+/// part of the desired-state comparison: re-inserts entries whose
+/// value changed even if the key already exists.
+fn reconcile_mss_lpm<K>(
+    state: &mut ActiveState,
+    desired: &std::collections::HashMap<(u32, K), MssClampValue>,
+    map_label: &str,
+) -> ModuleResult<DeltaCount>
+where
+    K: aya::Pod + Eq + std::hash::Hash + std::fmt::Debug + Clone + Copy,
+{
+    let map = state.ebpf.map_mut(map_label).ok_or_else(|| {
+        ModuleError::other(MODULE_NAME, format!("{map_label} map missing from ELF"))
+    })?;
+    let mut trie: LpmTrie<_, K, MssClampValue> = LpmTrie::try_from(map)
+        .map_err(|e| ModuleError::other(MODULE_NAME, format!("{map_label} try_from: {e}")))?;
+
+    let current_keys: HashSet<(u32, K)> = trie
+        .keys()
+        .filter_map(Result::ok)
+        .map(|k| (k.prefix_len(), k.data()))
+        .collect();
+
+    let mut delta = DeltaCount::default();
+
+    // Insert/overwrite desired entries.
+    for ((len, data), value) in desired {
+        let key = LpmKey::new(*len, *data);
+        // Treat any insert as either an add (key not present) or an
+        // update (key present, value possibly changed). The delta
+        // counts adds only; updates are silent — operators see them
+        // as "0 added, 0 removed" and have to look at counters to
+        // confirm new values landed.
+        let was_present = current_keys.contains(&(*len, *data));
+        match trie.insert(&key, *value, 0) {
+            Ok(()) => {
+                if !was_present {
+                    delta.added += 1;
+                }
+            }
+            Err(e) => warn!(
+                map = map_label,
+                prefix_len = *len,
+                ?data,
+                error = %e,
+                "mss-clamp insert failed"
+            ),
+        }
+    }
+
+    // Remove keys absent from desired.
+    for (len, data) in &current_keys {
+        if !desired.contains_key(&(*len, *data)) {
+            let key = LpmKey::new(*len, *data);
+            match trie.remove(&key) {
+                Ok(()) => delta.removed += 1,
+                Err(e) => warn!(
+                    map = map_label,
+                    prefix_len = *len,
+                    ?data,
+                    error = %e,
+                    "mss-clamp remove failed"
+                ),
+            }
+        }
+    }
+
+    Ok(delta)
+}
+
+fn reconcile_mss_iface(
+    state: &mut ActiveState,
+    desired: &std::collections::HashMap<u32, u16>,
+) -> ModuleResult<DeltaCount> {
+    let map = state.ebpf.map_mut("MSS_CLAMP_BY_IFACE").ok_or_else(|| {
+        ModuleError::other(MODULE_NAME, "MSS_CLAMP_BY_IFACE map missing from ELF")
+    })?;
+    let mut hm: AyaHashMap<_, u32, u16> = AyaHashMap::try_from(map).map_err(|e| {
+        ModuleError::other(MODULE_NAME, format!("MSS_CLAMP_BY_IFACE try_from: {e}"))
+    })?;
+
+    let current_keys: HashSet<u32> = hm.keys().filter_map(Result::ok).collect();
+
+    let mut delta = DeltaCount::default();
+
+    for (ifindex, mss) in desired {
+        let was_present = current_keys.contains(ifindex);
+        match hm.insert(ifindex, mss, 0) {
+            Ok(()) => {
+                if !was_present {
+                    delta.added += 1;
+                }
+            }
+            Err(e) => warn!(
+                ifindex = *ifindex,
+                error = %e,
+                "MSS_CLAMP_BY_IFACE insert failed"
+            ),
+        }
+    }
+    for ifindex in &current_keys {
+        if !desired.contains_key(ifindex) {
+            match hm.remove(ifindex) {
+                Ok(()) => delta.removed += 1,
+                Err(e) => warn!(
+                    ifindex = *ifindex,
+                    error = %e,
+                    "MSS_CLAMP_BY_IFACE remove failed"
+                ),
+            }
+        }
+    }
+
+    Ok(delta)
+}
diff --git a/docs/runbooks/mss-clamp.md b/docs/runbooks/mss-clamp.md
new file mode 100644
index 0000000..a7406b8
--- /dev/null
+++ b/docs/runbooks/mss-clamp.md
@@ -0,0 +1,104 @@
+# MSS clamping (v0.2.4+)
+
+Operator guide for `mss-clamp` directives in `module fast-path`. Closes the [SPEC.md §11.4](../../SPEC.md) gap where iptables `TCPMSS --set-mss` rules don't fire on fast-pathed flows because XDP redirect (`bpf_redirect_map`) bypasses netfilter.
+ +## Contents + +- [When to use it](#when-to-use-it) +- [Grammar + lookup precedence](#grammar--lookup-precedence) +- [What gets clamped](#what-gets-clamped) +- [Troubleshooting](#troubleshooting) +- [Why no `from <iface>` (ingress) form?](#why-no-from-iface-ingress-form) +- [Hot-reload semantics](#hot-reload-semantics) + +## When to use it + +Add `mss-clamp` directives if any of these are true on your edge: + +- A downstream peer's path MTU is less than the local link MTU (typical: PPPoE, GRE, IPsec, WireGuard, MPLS overlays). Without clamping, large segments arrive at the bottleneck, get fragmented or PMTUD-discovered-then-dropped, and TCP throughput collapses. +- You currently rely on `iptables -A FORWARD ... TCPMSS --set-mss <mss>` rules and one or more of the `-s` / `-d` prefixes overlaps an `allow-prefix`. Those iptables rules do not fire for fast-pathed traffic — adding a matching `mss-clamp` directive moves the mutation into XDP. +- You deploy onto a host where you don't control upstream MTU but want to defend against MTU-blackhole-induced TCP stalls. + +If none of those apply, you don't need this. PacketFrame doesn't insert any clamp by default. + +## Grammar + lookup precedence + +Four forms accepted, looked up in order of specificity at packet time. The first match wins: + +| # | Form | Scope | +|---|---|---| +| 1 | `mss-clamp <prefix> via <iface> <mss>` | Source-or-dest prefix AND egress iface (most specific) | +| 2 | `mss-clamp <prefix> <mss>` | Source-or-dest prefix, any egress | +| 3 | `mss-clamp via <iface> <mss>` | Egress iface, any prefix | +| 4 | `mss-clamp <mss>` | Global fallback | + +Examples for the typical "clamp customer SYNs leaving the WAN" case: + +``` +mss-clamp via eth2 1360 # everything leaving eth2 +mss-clamp 23.191.201.0/24 via eth2 1360 # only customer 23.191.201.0/24 leaving eth2 +mss-clamp 23.191.201.0/24 1360 # customer 23.191.201.0/24, any egress +``` + +Prefix matches **src OR dst** (same semantic as `allow-prefix`), so one rule covers both directions of a flow.
+ +CIDR ranges work for both IPv4 and IPv6 — the parser dispatches on the address family. `mss-clamp 2001:db8::/48 1280` is valid. + +`<mss>` is the clamp ceiling in bytes. Range: 88–65495. The clamp is **lower-if-higher** — if the SYN's existing MSS is already ≤ the configured value, the packet is left untouched and `mss_clamp_skipped` is bumped instead of `mss_clamp_applied`. + +## What gets clamped + +- Only **matched** traffic (i.e. `allow-prefix` / `allow-prefix6` already hit). Non-matched traffic flows through the kernel where existing iptables `TCPMSS` rules still fire normally. +- Only **TCP SYN and SYN-ACK** packets. Established-connection packets don't carry an MSS option, so there's nothing to mutate. +- Both directions: a SYN egressing eth2 from a clamped prefix, AND the responder's SYN-ACK egressing back into the customer LAN. TCP's per-direction MSS is independent — clamping both ensures both endpoints respect the constraint. +- Only when a clamp value > 0 applies. A packet whose lookup returns no policy is forwarded with no counter activity. + +What is **not** touched: the original packet's TCP timestamp, SACK, window-scale, or any other option. Only the MSS option's 2 bytes change. Checksum is recomputed via RFC 1624 incremental update — no full TCP-segment re-fold. + +## Troubleshooting + +The two relevant counters (SPEC §4.6 indices 33–34, exposed via `packetframe status` and the metrics textfile): + +- `mss_clamp_applied` — packets where the MSS option was rewritten. Climbs with new TCP sessions on clamped prefixes; flat means either no SYNs are arriving on those prefixes or your existing SYNs already announce ≤ clamp. +- `mss_clamp_skipped` — packets matched + with a clamp policy active, but no rewrite. Common reasons: + - Existing MSS already ≤ clamp value (working as intended; you can ignore unless you expect *every* SYN to need adjustment). + - SYN had no MSS option (rare; some old/embedded stacks).
+ - Malformed TCP options block (very rare; would also break the kernel's processing). + +To confirm the clamp is firing on real traffic: + +```sh +sudo packetframe status | grep mss_clamp +# or via Prometheus textfile: +grep packetframe_mss_clamp /var/lib/node_exporter/textfile/packetframe.prom +``` + +To see the wire MSS that egresses, capture on the egress interface (clamping happens inside the XDP redirect path, so `tcpdump -i <egress-iface>` shows the post-clamp value): + +```sh +sudo tcpdump -i eth2 -n 'tcp[tcpflags] & tcp-syn != 0' -c 5 -vv +# Look for: ... [S], options [mss <value>, ... +``` + +If `mss_clamp_applied` is climbing but downstream still shows MTU-blackhole symptoms, the clamp value is probably too high. The standard math: `MSS = MTU - 40` for IPv4, `MSS = MTU - 60` for IPv6 (each subtracts the IP+TCP header overhead). + +## Why no `from <iface>` (ingress) form? + +PacketFrame's XDP runs at ingress on attached physical NICs. The directional concept that matters operationally is "what egress will this fast-pathed packet take" — resolved by `bpf_fib_lookup` before redirect. `via <iface>` always means **egress**, matching the `local-prefix via X` and `fallback-default via X` grammar. + +For the cases where iptables operators reach for `-i <iface>` (e.g. "clamp packets coming in on this tunnel"), the realistic need is "clamp this customer's traffic" — better expressed via prefix scoping. Prefix scoping is route-stable; ingress-iface scoping depends on which physical bridge member happened to receive the packet. + +If a future use case needs strict ingress scoping, the grammar is append-only — adding `from <iface>` later is straightforward.
+ +## Hot-reload semantics + +Changes to `mss-clamp` directives are applied via SIGHUP without re-attaching XDP: + +```sh +# Edit /etc/packetframe/packetframe.conf, then: +sudo packetframe reconfigure # or `systemctl reload packetframe` +``` + +The reconcile path performs delta updates against the LPM tries (`MSS_CLAMP_V4`, `MSS_CLAMP_V6`) and the per-iface table — adds, removes, and value updates all happen in place. The global `mss-clamp <mss>` form lives in the `CFG` array and is updated atomically. + +A bad config (e.g. value out of range, malformed CIDR) is rejected at parse time; the CLI exits non-zero with the parse error, the running daemon keeps the old policy in effect. diff --git a/docs/runbooks/reconfigure.md b/docs/runbooks/reconfigure.md new file mode 100644 index 0000000..76581af --- /dev/null +++ b/docs/runbooks/reconfigure.md @@ -0,0 +1,96 @@ +# Reconfigure (SIGHUP) — operator guide (v0.2.4+) + +What `packetframe reconfigure` and `systemctl reload packetframe` do, what's hot-reloadable, and what still needs a full restart. The wiring landed in v0.2.4; before that, the README's `systemctl reload packetframe` line was aspirational. + +## Contents + +- [Quick reference](#quick-reference) +- [What's hot-reloadable](#whats-hot-reloadable) +- [What requires a restart](#what-requires-a-restart) +- [How the handshake works](#how-the-handshake-works) +- [Error semantics](#error-semantics) +- [Operating under systemd](#operating-under-systemd) + +## Quick reference + +```sh +# Edit /etc/packetframe/packetframe.conf, then: +sudo packetframe reconfigure # synchronous; exits non-zero on parse error +sudo systemctl reload packetframe # equivalent — both end up sending SIGHUP + +# Inspect the latest reconfigure result: +cat /var/lib/packetframe/state/last-reconfigure.timestamp +# OK — config parsed + every module reconciled +# ERR parse: <message> — config didn't parse; daemon kept old config +# ERR module: <module>: <error>; ...
— at least one module's reconcile() failed +``` + +The CLI returns immediately on parse failure (exit non-zero, message on stderr). On success it polls the marker file for up to 5s and returns 0 once the daemon has acknowledged. + +## What's hot-reloadable + +These directives can be added, removed, or changed under SIGHUP without re-attaching XDP: + +| Directive | Map(s) updated | Notes | +|---|---|---| +| `allow-prefix`, `allow-prefix6` | `ALLOW_V4`, `ALLOW_V6` | Delta diff vs in-kernel state | +| `block-prefix` | `BLOCK_V4` | v0.2.4+ — wired up alongside reconfigure | +| `dry-run on/off` | `CFG.dry_run` | Single-byte write | +| `forwarding-mode {kernel-fib\|custom-fib\|compare}` | `CFG.flags` bits 3-4 | Atomic | +| `mss-clamp …` (all four grammars) | `MSS_CLAMP_V4/V6` + `MSS_CLAMP_BY_IFACE` + `CFG.mss_clamp_global` | v0.2.4+ — value changes also pick up | +| (auto) VLAN-subif resolution | `VLAN_RESOLVE` | Re-scanned from `/proc/net/vlan/config` | +| (auto) Redirect devmap | `REDIRECT_DEVMAP` | Re-scanned from `/sys/class/net` | + +Adds-before-removes ordering: a renamed prefix (remove + add of the same value) never has a window where neither exists. + +## What requires a restart + +These need `systemctl restart packetframe` (or stop + run): + +- **`attach` directives (interface added or removed).** XDP attach mutates kernel-side state and risks brief link bounce on some drivers (SPEC §11.8). The reconcile path explicitly logs a warning and skips attach-set changes — your delta does not silently apply. +- **`route-source` config (custom-FIB only).** The RouteController's runtime is started at attach. Editing the BGP/BMP listener address or peer-AS requires bringing the runtime down and back up. +- **`circuit-breaker` thresholds.** The breaker sampler thread reads its config at thread start; it doesn't currently observe SIGHUP. +- **`local-prefix` directives (custom-FIB only).** The connected-fast-path resolver is similarly attach-time-bound. 
+- **`bpffs-root`, `state-dir`.** Used at module load only; baked into the running daemon's pin paths and the metrics file location. + +If you change one of these in the config and reload, the daemon keeps using the old value silently (with a `WARN`-level log line for attach-set changes). Restart is the only way through. + +## How the handshake works + +1. The daemon writes `/var/lib/packetframe/state/packetframe.pid` after attach succeeds and removes it on clean exit. +2. `packetframe reconfigure` (or systemd's `ExecReload=`) reads the PID file and cross-checks `/proc/<pid>/comm == "packetframe"` to defend against stale-PID-after-process-recycle. +3. The CLI snapshots the mtime of the ack-marker file (`last-reconfigure.timestamp`), then sends SIGHUP via `kill(2)`. +4. The daemon's signal loop catches SIGHUP, re-parses the config, and calls `Module::reconfigure()` on each loaded module. Per-module errors are logged but not fatal — partial-update state is strictly better than halting mid-reconcile. +5. The daemon writes `OK …` (or `ERR …`) to the marker file via write-then-rename. +6. The CLI polls every 100 ms for up to 5s. When the marker's mtime advances, it reads the body and exits accordingly. + +If the daemon doesn't ack within 5s, the CLI exits with a "wedged daemon" message. In practice this only fires if the SIGHUP was lost (kernel signal queue full — extremely unlikely) or the reconcile path itself hangs (no observed cases). + +## Error semantics + +| CLI exit | Cause | Daemon state | +|---|---|---| +| 0 | `OK` marker observed | New config in effect | +| non-zero, `parse error: ...` on stderr | Config didn't parse | Daemon is **still running with the old config** | +| non-zero, `daemon rejected: <module>: ...` | At least one module's reconcile failed | Other modules reconciled; the failing module retains old map state | +| non-zero, `daemon not running` | PID file absent or stale | Operator action: start the daemon | +| non-zero, `daemon did not acknowledge ...
within 5s` | SIGHUP delivered but no marker update | Investigate via `journalctl -u packetframe` | + +The "old config preserved on parse error" semantics matter for operators editing live: a typo in the config does not take down the data plane. + +## Operating under systemd + +The shipped unit at `/lib/systemd/system/packetframe.service` (installed by the `.deb`) wires `ExecReload=/bin/kill -HUP $MAINPID` and `PIDFile=/var/lib/packetframe/state/packetframe.pid`. So: + +```sh +sudo systemctl reload packetframe # equivalent to `packetframe reconfigure` +journalctl -u packetframe --since '5min ago' | grep -i 'sighup\|reconfigure\|reconcile' +``` + +systemd's `reload` is fire-and-forget — it doesn't poll the ack marker. If you want exit-code-on-failure semantics for scripted use, prefer `packetframe reconfigure` directly. + +The unit ships disabled by default (the `.deb` postinst does not auto-enable). After editing the config the first time you'll want: + +```sh +sudo systemctl enable --now packetframe +```