diff --git a/.github/workflows/docker-scan.yml b/.github/workflows/docker-scan.yml index 3ffffcecc..b1324956a 100644 --- a/.github/workflows/docker-scan.yml +++ b/.github/workflows/docker-scan.yml @@ -27,7 +27,7 @@ jobs: with: # Define paths that trigger specific component workflows # Changes to observability affect multiple components - list-files: 'json' + list-files: "json" filters: | dockerfile: - 'docker/**/Dockerfile' @@ -88,7 +88,7 @@ jobs: build-args: | RUST_IMAGE_VERSION=${{ env.RUST_IMAGE_VERSION }} TARGETARCH=amd64 - context: '.' + context: "." secrets: BLOCKCHAIN_ACTIONS_TOKEN=${{ secrets.BLOCKCHAIN_ACTIONS_TOKEN }} file: ${{ matrix.dockerfile }} platforms: linux/amd64 @@ -102,11 +102,11 @@ jobs: base:latest - name: Scan image with Trivy - uses: aquasecurity/trivy-action@c1824fd6edce30d7ab345a9989de00bbd46ef284 # v0.34.0 + uses: aquasecurity/trivy-action@97e0b3872f55f89b95b2f65b3dbab56962816478 # v0.34.2 with: image-ref: base:latest - format: 'table' - severity: 'CRITICAL,HIGH' + format: "table" + severity: "CRITICAL,HIGH" output: trivy-result.txt - name: Check Trivy result file diff --git a/Cargo.lock b/Cargo.lock index 7a71577c7..ef3f9c440 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4320,6 +4320,8 @@ dependencies = [ "tfhe-versionable", "thiserror 2.0.12", "threshold-fhe", + "tikv-jemalloc-ctl", + "tikv-jemallocator", "tokio", "tokio-rustls 0.26.2", "tokio-util", @@ -5002,6 +5004,7 @@ dependencies = [ "sysinfo", "tempfile", "thiserror 2.0.12", + "tikv-jemalloc-ctl", "tokio", "tonic 0.13.1", "tracing", @@ -7755,6 +7758,37 @@ dependencies = [ "zeroize", ] +[[package]] +name = "tikv-jemalloc-ctl" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "661f1f6a57b3a36dc9174a2c10f19513b4866816e13425d3e418b11cc37bc24c" +dependencies = [ + "libc", + "paste", + "tikv-jemalloc-sys", +] + +[[package]] +name = "tikv-jemalloc-sys" +version = "0.6.1+5.3.0-1-ge13ca993e8ccb9ba9847cc330696e02839f328f7" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd8aa5b2ab86a2cefa406d889139c162cbb230092f7d1d7cbc1716405d852a3b" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "tikv-jemallocator" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0359b4327f954e0567e69fb191cf1436617748813819c94b8cd4a431422d053a" +dependencies = [ + "libc", + "tikv-jemalloc-sys", +] + [[package]] name = "time" version = "0.3.47" diff --git a/Cargo.toml b/Cargo.toml index 523edf5f2..668ac845d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -185,6 +185,8 @@ tfhe-csprng = "=0.8.0" # Cryptographically secure PRNG for TFHE - LOW RISK: Zam tfhe-versionable = "=0.7.0" # TFHE versioning support - LOW RISK: Zama tfhe-zk-pok = "=0.8.0" # Zero-knowledge proofs for TFHE - LOW RISK: Zama thiserror = "=2.0.12" # Error derive macro - MEDIUM RISK: Reputable individual maintainer (dtolnay), 545M downloads +tikv-jemalloc-ctl = { version = "=0.6.1", features = ["use_std", "stats"] } # jemalloc runtime control - LOW RISK: TiKV team (PingCAP), bindings to jemalloc +tikv-jemallocator = { version = "=0.6.1", features = ["profiling", "unprefixed_malloc_on_supported_platforms"] } # jemalloc allocator with heap profiling - LOW RISK: TiKV team (PingCAP), 17M+ downloads tokio = { version = "=1.46.1", features = ["full"] } # Async runtime - LOW RISK: tokio team, industry standard tokio-rustls = { version = "=0.26.2", default-features = false, features = ["aws_lc_rs"] } # Async TLS - LOW RISK: rustls team, memory-safe TLS implementation tokio-util = { version = "=0.7.15", features = ["rt"] } # Tokio utilities - LOW RISK: tokio team @@ -243,6 +245,16 @@ inherits = "release" # but set LTO to the default (off instead of fat) lto = "off" +# profile for heap profiling with jemalloc — keeps line-table debug info +# so jeprof can resolve addresses to function:line instead of ??:0. 
+# Uses debug=1 (line tables only) instead of debug=2 (full DWARF) to +# dramatically reduce build time while retaining all info jeprof needs. +[profile.heap-profiling] +inherits = "release" +debug = 1 +strip = "none" +lto = "off" + [patch.crates-io] # MEDIUM RISK: Using fork instead of upstream - verify changes, consider upstreaming attestation-doc-validation = { git = 'https://github.com/mkmks/attestation-doc-validation.git', branch = 'timestamps' } diff --git a/Makefile b/Makefile index 9b237f347..dd60274d2 100644 --- a/Makefile +++ b/Makefile @@ -25,6 +25,38 @@ start-compose-threshold-telemetry: stop-compose-threshold-telemetry: docker compose -vvv -f docker-compose-core-base.yml -f docker-compose-core-threshold.yml -f docker-compose-telemetry.yml down --volumes --remove-orphans +build-compose-heap-profiling: + docker compose -f docker-compose-core-base.yml -f docker-compose-core-threshold.yml -f profiling/docker-compose-heap-profiling.yml -f docker-compose-telemetry.yml build + +start-compose-heap-profiling: + docker compose -f docker-compose-core-base.yml -f docker-compose-core-threshold.yml -f profiling/docker-compose-heap-profiling.yml -f docker-compose-telemetry.yml up -d --wait + +stop-compose-heap-profiling: + docker compose -f docker-compose-core-base.yml -f docker-compose-core-threshold.yml -f profiling/docker-compose-heap-profiling.yml -f docker-compose-telemetry.yml down --volumes --remove-orphans + +# Dump heap profiles from all cores and copy them locally for analysis +dump-heap-profiles: + @mkdir -p profiling/heap-dumps + @for i in 1 2 3 4; do \ + echo "Dumping heap profile for dev-kms-core-$$i..."; \ + docker compose -f docker-compose-core-base.yml -f docker-compose-core-threshold.yml -f profiling/docker-compose-heap-profiling.yml \ + exec dev-kms-core-$$i killall -USR1 kms-server 2>/dev/null || true; \ + done + @sleep 1 + @for i in 1 2 3 4; do \ + echo "Copying dumps from dev-kms-core-$$i..."; \ + docker compose -f docker-compose-core-base.yml 
-f docker-compose-core-threshold.yml -f profiling/docker-compose-heap-profiling.yml \ + cp dev-kms-core-$$i:/tmp/kms-heap/ ./profiling/heap-dumps/core-$$i/ 2>/dev/null || true; \ + echo "Capturing /proc/maps for dev-kms-core-$$i..."; \ + docker compose -f docker-compose-core-base.yml -f docker-compose-core-threshold.yml -f profiling/docker-compose-heap-profiling.yml \ + exec -T dev-kms-core-$$i sh -c 'cat /proc/$$(pidof kms-server)/maps' \ + > ./profiling/heap-dumps/core-$$i/maps.txt 2>/dev/null || true; \ + done + @echo "Copying kms-server binary for symbol resolution..." + @docker compose -f docker-compose-core-base.yml -f docker-compose-core-threshold.yml -f profiling/docker-compose-heap-profiling.yml \ + cp dev-kms-core-1:/app/kms/core/service/bin/kms-server ./profiling/heap-dumps/kms-server 2>/dev/null || true + @echo "Done. Analyze with: ./profiling/analyze-heap.sh ./profiling/heap-dumps/kms-server ./profiling/heap-dumps/core-1/" + # Test backwards compatibility with LFS files. This will pull the LFS files from git before running the tests. 
test-backward-compatibility: pull-lfs-files cargo test --test backward_compatibility_* -- --include-ignored diff --git a/core/service/Cargo.toml b/core/service/Cargo.toml index 68b4e212f..92ab54aca 100644 --- a/core/service/Cargo.toml +++ b/core/service/Cargo.toml @@ -104,6 +104,8 @@ tfhe = { workspace = true, features = [ ] } tfhe-versionable.workspace = true thiserror.workspace = true +tikv-jemalloc-ctl = { workspace = true, optional = true } +tikv-jemallocator = { workspace = true, optional = true } threshold-fhe = { workspace = true, default-features = false, features = [ "extension_degree_4", ] } @@ -175,6 +177,11 @@ non-wasm = [ "dep:tower-http", "dep:x509-parser", ] +heap-profiling = [ + "observability?/heap-profiling", + "dep:tikv-jemalloc-ctl", + "dep:tikv-jemallocator", +] slow_tests = ["testing"] wasm_tests = ["testing"] s3_tests = ["testing"] diff --git a/core/service/config/compose_centralized.toml b/core/service/config/compose_centralized.toml index 52f945dc4..7c4ec4ade 100644 --- a/core/service/config/compose_centralized.toml +++ b/core/service/config/compose_centralized.toml @@ -42,4 +42,4 @@ user_decrypt = 1 crsgen = 100 preproc = 25000 keygen = 1000 -new_epoch = 1 \ No newline at end of file +new_epoch = 1 diff --git a/core/service/src/bin/kms-server.rs b/core/service/src/bin/kms-server.rs index a34954388..2f17d0bff 100644 --- a/core/service/src/bin/kms-server.rs +++ b/core/service/src/bin/kms-server.rs @@ -46,6 +46,10 @@ use tokio_rustls::rustls::{ version::TLS13, }; +#[cfg(feature = "heap-profiling")] +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + #[derive(Parser)] #[clap(name = "KMS server")] #[clap( @@ -346,6 +350,9 @@ fn main() -> anyhow::Result<()> { /// Note that key material MUST exist when starting the server and be stored in the path specified by the configuration file. /// Please consult the `kms-gen-keys` binary for details on generating key material. 
async fn main_exec() -> anyhow::Result<()> { + #[cfg(feature = "heap-profiling")] + kms_lib::heap_profiling::install_sigusr1_handler(); + let args = KmsArgs::parse(); let (mut core_config, tracer_provider, meter_provider) = init_conf_kms_core_telemetry::(&args.config_file).await?; diff --git a/core/service/src/heap_profiling.rs b/core/service/src/heap_profiling.rs new file mode 100644 index 000000000..f5c36cb28 --- /dev/null +++ b/core/service/src/heap_profiling.rs @@ -0,0 +1,80 @@ +//! Heap profiling support using jemalloc. +//! +//! When the `heap-profiling` feature is enabled and `MALLOC_CONF` includes +//! `prof:true`, this module provides on-demand heap dumps. +//! +//! # Quick Start +//! +//! For the full Docker-based workflow (handles PIE/ASLR, symbol resolution, +//! and diff analysis automatically), see `profiling/README.md`. +//! +//! Manual (non-PIE binary) usage: +//! +//! 1. Build with: `cargo build -p kms --bin kms-server --profile heap-profiling -F heap-profiling` +//! 2. Run with env: `MALLOC_CONF=prof:true,lg_prof_sample:12 kms-server ...` +//! (use `lg_prof_sample:19` for lower overhead — see `profiling/README.md`) +//! 3. Dump heap: `kill -USR1 ` +//! 4. Analyze: `jeprof --svg kms-server /tmp/kms-heap/prof.0001.heap > heap.svg` +//! 5. Diff two dumps: `jeprof --base=prof.0001.heap --svg kms-server prof.0010.heap > diff.svg` + +use std::sync::atomic::{AtomicUsize, Ordering}; + +const HEAP_DUMP_DIR: &str = "/tmp/kms-heap"; + +static DUMP_SEQ: AtomicUsize = AtomicUsize::new(0); + +/// Dump a heap profile to `/tmp/kms-heap/prof.NNNN.heap`. +/// +/// Creates the output directory if it does not already exist. 
+pub fn dump_heap_profile() -> Result { + // Ensure the output directory exists (idempotent) + if let Err(e) = std::fs::create_dir_all(HEAP_DUMP_DIR) { + eprintln!("[heap-profiling] WARNING: failed to create {HEAP_DUMP_DIR}: {e}"); + } + + let seq = DUMP_SEQ.fetch_add(1, Ordering::Relaxed); + let path_str = format!("{HEAP_DUMP_DIR}/prof.{seq:04}.heap"); + let path_c = + std::ffi::CString::new(path_str.clone()).map_err(|e| format!("invalid path: {e}"))?; + + // jemalloc mallctl expects a pointer to the filename string + let ptr = path_c.as_ptr(); + // SAFETY: `ptr` points to a valid null-terminated C string (`path_c`) that + // outlives this call. jemalloc's `prof.dump` mallctl expects a `const char *` + // and `raw::write` passes `&ptr` as `newp`, matching the expected ABI. + let result = unsafe { tikv_jemalloc_ctl::raw::write(b"prof.dump\0", ptr) }; + + match result { + Ok(()) => { + eprintln!("[heap-profiling] Dumped to {path_str}"); + Ok(path_str) + } + Err(e) => { + let msg = format!("jemalloc prof.dump failed: {e}. Is MALLOC_CONF=prof:true set?"); + eprintln!("[heap-profiling] ERROR: {msg}"); + Err(msg) + } + } +} + +/// Install a SIGUSR1 handler that triggers heap profile dumps. +/// +/// Call this once at startup. Then `kill -USR1 ` to dump. 
+pub fn install_sigusr1_handler() { + if let Err(e) = std::fs::create_dir_all(HEAP_DUMP_DIR) { + eprintln!("[heap-profiling] WARNING: failed to create {HEAP_DUMP_DIR}: {e}"); + } + + // Spawn a background tokio task to listen for SIGUSR1 + tokio::spawn(async { + let mut sig = tokio::signal::unix::signal(tokio::signal::unix::SignalKind::user_defined1()) + .expect("Failed to register SIGUSR1 handler"); + + eprintln!("[heap-profiling] Ready — send SIGUSR1 to dump heap profile to {HEAP_DUMP_DIR}/"); + + loop { + sig.recv().await; + let _ = dump_heap_profile(); + } + }); +} diff --git a/core/service/src/lib.rs b/core/service/src/lib.rs index 04705665b..ebc54467b 100644 --- a/core/service/src/lib.rs +++ b/core/service/src/lib.rs @@ -35,6 +35,9 @@ pub mod testing; #[cfg(feature = "non-wasm")] pub mod vault; +#[cfg(feature = "heap-profiling")] +pub mod heap_profiling; + #[cfg(feature = "non-wasm")] pub use kms_grpc::utils::tonic_result::BoxedStatus; diff --git a/docker/core/service/Dockerfile b/docker/core/service/Dockerfile index d9f24ba6d..0cc3ebd9b 100644 --- a/docker/core/service/Dockerfile +++ b/docker/core/service/Dockerfile @@ -8,6 +8,10 @@ FROM --platform=$BUILDPLATFORM ghcr.io/zama-ai/kms/rust-golden-image:latest AS k # But you can provide --build-arg LTO_RELEASE="--profile release-lto-off" locally to build locally ARG LTO_RELEASE=release ARG APP_CACHE_DIR=kms +# Extra Cargo features to enable, comma-separated (e.g., "heap-profiling" for jemalloc profiling) +ARG CARGO_EXTRA_FEATURES="" +ARG RUSTFLAGS="" +ENV RUSTFLAGS=${RUSTFLAGS} # Fetch dependencies and build binaries WORKDIR /app/kms @@ -24,11 +28,11 @@ COPY docker ./docker RUN mkdir -p /app/kms/core/service/bin RUN --mount=type=cache,target=/root/.cargo/registry,sharing=locked \ - --mount=type=cache,target=/app/${APP_CACHE_DIR}/target,sharing=locked \ + --mount=type=cache,target=/app/${APP_CACHE_DIR}/target,sharing=locked,id=cargo-target-${LTO_RELEASE} \ cargo fetch --locked RUN 
--mount=type=cache,target=/root/.cargo/registry,sharing=locked \ - --mount=type=cache,target=/app/${APP_CACHE_DIR}/target,sharing=locked \ - cargo build --locked --profile=${LTO_RELEASE} -p kms --bin kms-server --bin kms-gen-tls-certs --bin kms-init --bin kms-custodian -F insecure && \ + --mount=type=cache,target=/app/${APP_CACHE_DIR}/target,sharing=locked,id=cargo-target-${LTO_RELEASE} \ + cargo build --locked --profile=${LTO_RELEASE} -p kms --bin kms-server --bin kms-gen-tls-certs --bin kms-init --bin kms-custodian -F insecure ${CARGO_EXTRA_FEATURES:+-F "$CARGO_EXTRA_FEATURES"} && \ cargo build --locked --profile=${LTO_RELEASE} -p kms --bin kms-gen-keys -F testing -F threshold-fhe/testing -F insecure && \ cp /app/kms/target/${LTO_RELEASE}/kms-server \ /app/kms/target/${LTO_RELEASE}/kms-gen-tls-certs \ diff --git a/observability/Cargo.toml b/observability/Cargo.toml index 68ee7840a..9b6f0cf9f 100644 --- a/observability/Cargo.toml +++ b/observability/Cargo.toml @@ -29,6 +29,7 @@ strum.workspace = true strum_macros.workspace = true sysinfo.workspace = true thiserror.workspace = true +tikv-jemalloc-ctl = { workspace = true, optional = true } tokio = { workspace = true, features = ["rt", "rt-multi-thread", "macros", "net"] } tonic.workspace = true tracing = { workspace = true, features = ["log", "async-await"] } @@ -48,3 +49,4 @@ ignored = ["strum"] [features] default = [] +heap-profiling = ["dep:tikv-jemalloc-ctl"] diff --git a/observability/src/metrics.rs b/observability/src/metrics.rs index 6687026b7..ab0524835 100644 --- a/observability/src/metrics.rs +++ b/observability/src/metrics.rs @@ -80,6 +80,10 @@ pub struct CoreMetrics { process_cpu_usage_gauge: TaggedMetric>, // CPU load for the current process in percentage total_memory_gauge: TaggedMetric>, // Total memory available process_memory_gauge: TaggedMetric>, // Memory usage for the current process + #[cfg(feature = "heap-profiling")] + jemalloc_allocated_gauge: TaggedMetric>, // Bytes actively allocated by 
the application (via jemalloc) + #[cfg(feature = "heap-profiling")] + jemalloc_resident_gauge: TaggedMetric>, // Bytes mapped by jemalloc from OS cpu_load_gauge: TaggedMetric>, // 1-minute average CPU load, divided by number of cores memory_usage_gauge: TaggedMetric>, // Trace guard for file-based logging @@ -146,6 +150,12 @@ impl CoreMetrics { format!("{}_total_memory", config.prefix).into(); let process_memory_metric: Cow<'static, str> = format!("{}_process_memory_usage", config.prefix).into(); + #[cfg(feature = "heap-profiling")] + let jemalloc_allocated_metric: Cow<'static, str> = + format!("{}_jemalloc_allocated", config.prefix).into(); + #[cfg(feature = "heap-profiling")] + let jemalloc_resident_metric: Cow<'static, str> = + format!("{}_jemalloc_resident", config.prefix).into(); let cpu_load_metric: Cow<'static, str> = format!("{}_cpu_load", config.prefix).into(); let memory_usage_metric: Cow<'static, str> = format!("{}_memory_usage", config.prefix).into(); @@ -318,6 +328,26 @@ impl CoreMetrics { //Record 0 just to make sure the gauge is exported process_memory_gauge.record(0, &[]); + #[cfg(feature = "heap-profiling")] + let jemalloc_allocated_gauge = meter + .u64_gauge(jemalloc_allocated_metric) + .with_description("Bytes actively allocated by the application (via jemalloc)") + .with_unit("bytes") + .build(); + #[cfg(feature = "heap-profiling")] + //Record 0 just to make sure the gauge is exported + jemalloc_allocated_gauge.record(0, &[]); + + #[cfg(feature = "heap-profiling")] + let jemalloc_resident_gauge = meter + .u64_gauge(jemalloc_resident_metric) + .with_description("Bytes mapped by jemalloc from OS (resident set)") + .with_unit("bytes") + .build(); + #[cfg(feature = "heap-profiling")] + //Record 0 just to make sure the gauge is exported + jemalloc_resident_gauge.record(0, &[]); + let cpu_gauge = meter .f64_gauge(cpu_load_metric) .with_description("CPU load for KMS (averaged over all CPUs)") @@ -358,6 +388,10 @@ impl CoreMetrics { total_memory_gauge: 
TaggedMetric::new(total_memory_gauge), process_cpu_usage_gauge: TaggedMetric::new(process_cpu_usage_gauge), process_memory_gauge: TaggedMetric::new(process_memory_gauge), + #[cfg(feature = "heap-profiling")] + jemalloc_allocated_gauge: TaggedMetric::new(jemalloc_allocated_gauge), + #[cfg(feature = "heap-profiling")] + jemalloc_resident_gauge: TaggedMetric::new(jemalloc_resident_gauge), trace_guard: Arc::new(Mutex::new(None)), } } @@ -598,6 +632,22 @@ impl CoreMetrics { .metric .record(usage, &self.process_memory_gauge.with_tags(&[])); } + + /// Record jemalloc's active allocation size (stats.allocated) + #[cfg(feature = "heap-profiling")] + pub fn record_jemalloc_allocated(&self, usage: u64) { + self.jemalloc_allocated_gauge + .metric + .record(usage, &self.jemalloc_allocated_gauge.with_tags(&[])); + } + + /// Record jemalloc's resident memory size (stats.resident) + #[cfg(feature = "heap-profiling")] + pub fn record_jemalloc_resident(&self, usage: u64) { + self.jemalloc_resident_gauge + .metric + .record(usage, &self.jemalloc_resident_gauge.with_tags(&[])); + } } /// Builder for DurationGuard to ensure proper initialization diff --git a/observability/src/sys_metrics.rs b/observability/src/sys_metrics.rs index fffcde51d..41f42a5b5 100644 --- a/observability/src/sys_metrics.rs +++ b/observability/src/sys_metrics.rs @@ -3,6 +3,8 @@ use std::{cmp::max, ffi::OsStr, fs, time::Duration}; use sysinfo::{ ProcessRefreshKind, ProcessesToUpdate, RefreshKind, System, MINIMUM_CPU_UPDATE_INTERVAL, }; +#[cfg(feature = "heap-profiling")] +use tikv_jemalloc_ctl::stats as jemalloc_stats; pub fn start_sys_metrics_collection(refresh_interval: Duration) -> anyhow::Result<()> { // Only fail for info we'll actually poll later on @@ -80,6 +82,28 @@ pub fn start_sys_metrics_collection(refresh_interval: Duration) -> anyhow::Resul let socat_count = get_socat_file_descriptor_count(&system); METRICS.record_socat_file_descriptors(socat_count); + // Jemalloc allocator stats + #[cfg(feature = 
"heap-profiling")] + { + // Advance jemalloc's stats epoch to get fresh values + match tikv_jemalloc_ctl::epoch::mib() { + Ok(epoch) => { + if let Err(e) = epoch.advance() { + tracing::debug!("jemalloc epoch advance failed: {e}"); + } + } + Err(e) => { + tracing::debug!("jemalloc epoch mib lookup failed: {e}"); + } + } + if let Ok(allocated) = jemalloc_stats::allocated::read() { + METRICS.record_jemalloc_allocated(allocated as u64); + } + if let Ok(resident) = jemalloc_stats::resident::read() { + METRICS.record_jemalloc_resident(resident as u64); + } + } + // Ensure we sleep at least the time needed to accurately update CPU usage, as recommended by sysinfo documentation tokio::time::sleep(max(refresh_interval, MINIMUM_CPU_UPDATE_INTERVAL)).await; } diff --git a/profiling/.gitignore b/profiling/.gitignore new file mode 100644 index 000000000..9fd4d8cc4 --- /dev/null +++ b/profiling/.gitignore @@ -0,0 +1,2 @@ +heap-dumps/ +heap-analysis/ diff --git a/profiling/README.md b/profiling/README.md new file mode 100644 index 000000000..b27156250 --- /dev/null +++ b/profiling/README.md @@ -0,0 +1,138 @@ +# Heap Profiling + +Detect memory leaks in KMS core nodes using jemalloc heap profiling. + +## How it works + +The `heap-profiling` Cargo profile builds with jemalloc and preserves debug symbols. At runtime, jemalloc samples allocations and dumps heap snapshots on memory peaks and on `SIGUSR1`. Comparing two snapshots shows only allocations that *grew* — these are your leak *candidates*. Some growth may be legitimate (caches warming up, connection pools expanding, etc.), so the analyst must judge which sites represent actual leaks. + +> **Note:** `heap-profiling` is used as both a Cargo **feature** (enables jemalloc code paths and SIGUSR1 handler) and a Cargo **profile** (preserves debug info for jeprof). The docker-compose file enables both automatically. 
+ +## Host dependencies + +- **jeprof** (from gperftools) — reads jemalloc `.heap` dumps +- **graphviz** — renders SVG flamegraphs (`dot`) +- **addr2line** (from binutils) — resolves addresses to source lines + +Install on Ubuntu/Debian: +```bash +sudo apt install google-perftools graphviz binutils +``` + +Install on macOS: +```bash +brew install gperftools graphviz binutils +``` + +## Usage + +All commands run from the repo root. + +### 1. Build + +```bash +make build-compose-heap-profiling +``` + +### 2. Start + +```bash +make start-compose-heap-profiling +``` + +Verify `[heap-profiling] Ready` appears in container logs. + +### 3. Dump heap profiles + +```bash +# Take a baseline dump before load +make dump-heap-profiles + +# ... run your workload ... + +# Take another dump after load +make dump-heap-profiles +``` + +This sends `SIGUSR1` to each core, copies `.heap` files, the binary, and `/proc/PID/maps` (for PIE address resolution) to `profiling/heap-dumps/`. + +### 4. Analyze + +```bash +./profiling/analyze-heap.sh ./profiling/heap-dumps/kms-server ./profiling/heap-dumps/core-1/ +``` + +Output in `profiling/heap-analysis/`: + +| File | Description | +|---|---| +| `top-leaks.txt` | Top allocation sites in the latest snapshot | +| `latest.svg` | Flamegraph of the latest snapshot | +| `diff-leaks.txt` | Allocation sites that grew between first and last snapshot | +| `diff.svg` | Diff flamegraph — **the most useful output** | + +Open the `.svg` files in a browser. The diff shows only allocations that increased between the two dumps. + +## Profiling + +### `lg_prof_sample` trade-offs + +The `lg_prof_sample` setting in `MALLOC_CONF` controls profiling granularity: + +| Value | Sample interval | Overhead | Use case | +|---|---|---|---| +| `19` | 512 KB | ~1-2% | Quick smoke-test, production-safe | +| `12` | 4 KB | ~15-20% | Detailed leak hunting (default in this stack) | + +Lower values capture more allocations but slow things down. 
The default is `12` for thorough profiling; bump to `19` if you only need a quick pass. + +### Diagnosing leak type with Prometheus metrics + +After deploying with the telemetry stack (included by default), compare these three metrics in Prometheus/Grafana: + +| `kms_jemalloc_allocated` | `kms_jemalloc_resident` | `kms_process_memory_usage` (RSS) | Diagnosis | +|---|---|---|---| +| Staircases up | Staircases up | Staircases up | **Application-level leak** — objects allocated and never freed | +| Flat | Staircases up | Staircases up | **Allocator fragmentation** — freed memory can't be returned due to mixed page usage | +| Flat | Flat | Staircases up | **Non-jemalloc memory growth** — mmap, thread stacks, shared libs, etc. | + +- `kms_jemalloc_allocated` — bytes the app actively holds via jemalloc +- `kms_jemalloc_resident` — bytes jemalloc has mapped from the OS (includes fragmentation) +- `kms_process_memory_usage` — total process RSS (includes non-jemalloc memory) + +## Files + +``` +profiling/ +├── README.md +├── analyze-heap.sh # Analysis script (handles PIE/ASLR address resolution) +├── docker-compose-heap-profiling.yml # Compose override (build args + MALLOC_CONF) +├── heap-dumps/ # Dumped .heap files + binary + maps.txt (git-ignored) +└── heap-analysis/ # Analysis output (git-ignored) +``` + +## Troubleshooting + +### Symbols show as `??:0` + +jeprof needs three things to resolve addresses: + +1. **Debug info in the binary** — the `heap-profiling` Cargo profile sets `debug=1` (line tables) +2. **`addr2line` on the host** — `which addr2line` (from binutils) +3. **`MAPPED_LIBRARIES:` section in the heap dump** — jemalloc writes this from `/proc/self/maps`. If missing, `make dump-heap-profiles` captures it separately as `maps.txt`, and `analyze-heap.sh` injects it automatically + +If symbols still don't resolve, check: +```bash +# Binary has debug sections? +readelf -S ./profiling/heap-dumps/kms-server | grep debug + +# Heap dump has MAPPED_LIBRARIES? 
+grep -c MAPPED_LIBRARIES ./profiling/heap-dumps/core-1/*.heap + +# maps.txt was captured? +ls -l ./profiling/heap-dumps/core-1/maps.txt +``` + +### Negative diff totals + +This happens when auto-dumps from `prof_gdump:true` (taken at memory peaks) get mixed with manual dumps. The script prefers manual dumps (from SIGUSR1) for diffing. For reliable diffs, always take two manual dumps: one before and one after your workload. diff --git a/profiling/analyze-heap.sh b/profiling/analyze-heap.sh new file mode 100755 index 000000000..b4df21aa3 --- /dev/null +++ b/profiling/analyze-heap.sh @@ -0,0 +1,261 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Analyze jemalloc heap profile dumps from KMS core nodes. +# +# Works on both Linux and macOS. +# +# Prerequisites: +# - Install jeprof: +# Ubuntu/Debian: apt install jemalloc +# macOS: brew install jemalloc +# - graphviz for SVG output: +# Ubuntu/Debian: apt install graphviz +# macOS: brew install graphviz +# - addr2line (from binutils) for symbol resolution +# +# Usage: +# # 1. Start the profiling stack (from repo root) +# make build-compose-heap-profiling +# make start-compose-heap-profiling +# +# # 2. Run your decryption workload, then dump + copy profiles +# make dump-heap-profiles +# +# # 3. Analyze +# ./profiling/analyze-heap.sh ./profiling/heap-dumps/kms-server ./profiling/heap-dumps/core-1/ +# +# Output (inside profiling/heap-analysis/): +# top-leaks.txt — text listing of largest allocation sites +# latest.svg — flamegraph of the latest heap snapshot +# diff-leaks.txt — allocation sites that GREW between first and last dump +# diff.svg — diff flamegraph (the most useful: shows your leaks) + +BINARY="${1:?Usage: $0 }" +DUMP_DIR="${2:?Usage: $0 }" + +# ── Find jeprof ────────────────────────────────────────────────────────── +JEPROF="" +for cmd in jeprof; do + if command -v "$cmd" &>/dev/null; then + JEPROF="$cmd" + break + fi +done + +if [ -z "$JEPROF" ]; then + echo "ERROR: jeprof not found. 
Install with:" + echo " Ubuntu/Debian: apt install jemalloc" + echo " macOS: brew install jemalloc" + echo " Or build from: https://github.com/jemalloc/jemalloc" + exit 1 +fi + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +OUT_DIR="$SCRIPT_DIR/heap-analysis" +mkdir -p "$OUT_DIR" + +# ── Cross-platform helpers ──────────────────────────────────────────────── +# realpath: available on Linux, but not always on macOS +portable_realpath() { + if command -v realpath &>/dev/null; then + realpath "$1" + else + # Python fallback (available on macOS by default) + python3 -c "import os,sys; print(os.path.realpath(sys.argv[1]))" "$1" + fi +} + +# sed -i: GNU sed uses -i (no suffix arg), BSD (macOS) sed requires -i '' (explicit empty suffix) +portable_sed_i() { + local expr="$1" + local file="$2" + if sed --version 2>/dev/null | grep -q GNU; then + sed -i "$expr" "$file" + else + sed -i '' "$expr" "$file" + fi +} + +ABS_BINARY="$(portable_realpath "$BINARY")" +BINARY_NAME="$(basename "$BINARY")" + +# Extract the binary load base address from a heap dump's MAPPED_LIBRARIES +get_binary_base() { + grep -A9999 '^MAPPED_LIBRARIES:' "$1" 2>/dev/null \ + | grep -F "$ABS_BINARY" | grep -E "^[0-9a-f].*r.xp " \ + | head -1 | cut -d'-' -f1 +} + +# ── Working directory for processed heap dumps ─────────────────────────── +# We create copies with the binary path in MAPPED_LIBRARIES rewritten to +# match the local binary. This is the critical fix for PIE/ASLR: jeprof +# uses the path in MAPPED_LIBRARIES to match the binary and compute the +# load offset — if the container path (/app/kms/.../kms-server) doesn't +# match the local path, jeprof can't translate addresses → ??:0. 
+WORK_DIR=$(mktemp -d) +trap 'rm -rf "$WORK_DIR"' EXIT + +# ── Find maps.txt (captured from /proc/PID/maps by make dump-heap-profiles) +MAPS_FILE="" +for candidate in "$DUMP_DIR/maps.txt" "$(dirname "$DUMP_DIR")/maps.txt"; do + if [ -f "$candidate" ]; then + MAPS_FILE="$candidate" + break + fi +done + +# ── Prepare a heap dump for jeprof ─────────────────────────────────────── +# 1. Inject MAPPED_LIBRARIES if jemalloc didn't include it +# 2. Rewrite the binary path so jeprof can match it to the local binary +DUMP_COUNTER=0 +prepare_dump() { + local src="$1" + local idx="$2" + local dst="$WORK_DIR/${idx}.heap" + cp "$src" "$dst" + + # Inject MAPPED_LIBRARIES from maps.txt if the dump doesn't have it + if [ -n "$MAPS_FILE" ] && ! grep -q '^MAPPED_LIBRARIES:' "$dst" 2>/dev/null; then + echo " Injecting MAPPED_LIBRARIES into $(basename "$src")" + printf '\nMAPPED_LIBRARIES:\n' >> "$dst" + cat "$MAPS_FILE" >> "$dst" + fi + + # Rewrite the binary path in MAPPED_LIBRARIES so jeprof can match it. + # Container path: /app/kms/core/service/bin/kms-server + # Local path: /home/user/.../profiling/heap-dumps/kms-server + if grep -q 'MAPPED_LIBRARIES:' "$dst" 2>/dev/null; then + portable_sed_i "s|[^ ]*/${BINARY_NAME}\$|${ABS_BINARY}|" "$dst" + fi + + echo "$dst" +} + +# ── Find heap dumps, sorted by modification time (oldest first) ────────── +# Prepare working copies and partition manual vs auto in a single pass. +# Auto-dumps (in /auto/ subdir from prof_gdump) fire at memory peaks and can +# invert diff direction, so we prefer manual dumps (from SIGUSR1) for diffs. 
+ALL_ORIG=() # all original paths +WORK_ALL=() # all working copies +MANUAL_ORIGINALS=() # manual-only originals +MANUAL_WORK=() # manual-only working copies +while IFS= read -r heap; do + work="$(prepare_dump "$heap" "$DUMP_COUNTER")" + DUMP_COUNTER=$((DUMP_COUNTER + 1)) + ALL_ORIG+=("$heap") + WORK_ALL+=("$work") + if [[ "$heap" != *"/auto/"* ]]; then + MANUAL_ORIGINALS+=("$heap") + MANUAL_WORK+=("$work") + fi +done < <(find "$DUMP_DIR" -name '*.heap' -type f -exec ls -1tr {} +) + +if [ ${#ALL_ORIG[@]} -eq 0 ]; then + echo "ERROR: No .heap files found in $DUMP_DIR" + echo "Did you run 'make dump-heap-profiles'?" + exit 1 +fi + +echo "Found ${#ALL_ORIG[@]} heap dump(s)" + +if [ ${#MANUAL_WORK[@]} -ge 2 ]; then + DUMPS_ORIG=("${MANUAL_ORIGINALS[@]}") + DUMPS_WORK=("${MANUAL_WORK[@]}") + echo "Using ${#MANUAL_WORK[@]} manual dump(s) for diff analysis" +else + DUMPS_ORIG=("${ALL_ORIG[@]}") + DUMPS_WORK=("${WORK_ALL[@]}") +fi + +LATEST_ORIG="${DUMPS_ORIG[@]: -1}" +LATEST_WORK="${DUMPS_WORK[@]: -1}" + +# ── 1. Top allocation sites in the latest dump ─────────────────────────── +echo "" +echo "=== Top allocation sites (latest dump: $(basename "$LATEST_ORIG")) ===" +("$JEPROF" --text --lines "$BINARY" "$LATEST_WORK" || true) | head -40 | tee "$OUT_DIR/top-leaks.txt" +echo "" + +# ── Fallback: manual addr2line if jeprof shows ??:0 ────────────────────── +if grep -q '??:0' "$OUT_DIR/top-leaks.txt" 2>/dev/null; then + echo "WARNING: jeprof could not resolve symbols (??:0)." 
+
+  # Try to recover the binary's load (base) address from the dump's
+  # MAPPED_LIBRARIES section — needed to turn runtime addresses into
+  # file-relative offsets below.
+  BASE_ADDR=$(get_binary_base "$LATEST_WORK")
+
+  if [ -n "$BASE_ADDR" ] && command -v addr2line &>/dev/null; then
+    echo " Falling back to manual addr2line (binary base: 0x${BASE_ADDR})"
+    echo ""
+    echo "=== Manual symbol resolution ==="
+    # Re-scan jeprof's text output for raw hex addresses and resolve each
+    # one ourselves (sort -u just deduplicates; order is irrelevant here).
+    grep -oE '0x[0-9a-f]+' "$OUT_DIR/top-leaks.txt" | sort -u | while read -r addr; do
+      # Offset into the binary: virtual_addr - load_base (the form addr2line
+      # expects for position-independent executables — presumably the case
+      # here; verify if symbols still come back as ??).
+      offset=$(printf "0x%x" $(( addr - 0x${BASE_ADDR} )) 2>/dev/null) || continue
+      resolved=$(addr2line -C -f -e "$BINARY" "$offset" 2>/dev/null | head -2 | tr '\n' ' ')
+      if [ -n "$resolved" ] && [[ "$resolved" != *"??"* ]]; then
+        printf " %-20s → %s\n" "$addr" "$resolved"
+      fi
+    done | tee "$OUT_DIR/resolved-symbols.txt"
+    echo ""
+  else
+    echo " Checklist:"
+    echo " 1. Binary has debug info? readelf -S '$BINARY' | grep debug"
+    echo " 2. addr2line installed? which addr2line"
+    if [ -z "$MAPS_FILE" ]; then
+      echo " 3. No maps.txt found — re-run 'make dump-heap-profiles' to capture /proc/PID/maps"
+    fi
+    echo ""
+  fi
+fi
+
+# ── 2. SVG flamegraph of the latest dump ─────────────────────────────────
+echo "Generating $OUT_DIR/latest.svg ..."
+"$JEPROF" --svg --lines "$BINARY" "$LATEST_WORK" > "$OUT_DIR/latest.svg"
+echo " Open $OUT_DIR/latest.svg in a browser to see the full allocation flamegraph."
+echo ""
+
+# ── 3. Diff between earliest and latest dump ─────────────────────────────
+if [ ${#DUMPS_WORK[@]} -ge 2 ]; then
+  EARLIEST_ORIG="${DUMPS_ORIG[0]}"
+  EARLIEST_WORK="${DUMPS_WORK[0]}"
+
+  # Detect cross-run diffs: if the ASLR base addresses differ, the dumps
+  # are from different process instances and the diff is meaningless.
+  BASE_EARLIEST=$(get_binary_base "$EARLIEST_WORK") # hex load address parsed from MAPPED_LIBRARIES
+  BASE_LATEST=$(get_binary_base "$LATEST_WORK")
+
+  if [ -n "$BASE_EARLIEST" ] && [ -n "$BASE_LATEST" ] && [ "$BASE_EARLIEST" != "$BASE_LATEST" ]; then
+    echo "WARNING: Dumps are from DIFFERENT process instances (ASLR bases differ)."
+    echo " earliest: 0x${BASE_EARLIEST} ($(basename "$EARLIEST_ORIG"))"
+    echo " latest: 0x${BASE_LATEST} ($(basename "$LATEST_ORIG"))"
+    echo " The diff below will be meaningless. Clean up and take fresh dumps:"
+    echo " rm -rf ./profiling/heap-dumps/core-*/"
+    echo " make dump-heap-profiles # first dump"
+    echo " # ... run workload ..."
+    echo " make dump-heap-profiles # second dump"
+    echo ""
+  fi
+
+  echo "=== Diff: $(basename "$EARLIEST_ORIG") → $(basename "$LATEST_ORIG") ==="
+  echo " (Shows allocations that GREW — i.e., your leaks)"
+  echo ""
+  ("$JEPROF" --text --lines --base="$EARLIEST_WORK" "$BINARY" "$LATEST_WORK" || true) | head -40 | tee "$OUT_DIR/diff-leaks.txt"
+  echo ""
+
+  echo "Generating $OUT_DIR/diff.svg ..."
+  "$JEPROF" --svg --lines --base="$EARLIEST_WORK" "$BINARY" "$LATEST_WORK" > "$OUT_DIR/diff.svg"
+  echo " Open $OUT_DIR/diff.svg — this is the MOST USEFUL output."
+  echo " It shows only the allocations that grew between the two dumps."
+  echo ""
+else
+  echo "Only 1 dump found. For diff analysis, take at least 2 manual dumps:"
+  echo " make dump-heap-profiles # before load"
+  echo " # ... run your workload ..."
+  echo " make dump-heap-profiles # after load"
+  echo ""
+fi
+
+echo "=== Analysis complete ==="
+echo "Files in $OUT_DIR/:"
+ls -lh "$OUT_DIR/"
diff --git a/profiling/docker-compose-heap-profiling.yml b/profiling/docker-compose-heap-profiling.yml
new file mode 100644
index 000000000..9dfc9e90c
--- /dev/null
+++ b/profiling/docker-compose-heap-profiling.yml
@@ -0,0 +1,63 @@
+# Docker Compose override that layers jemalloc heap profiling onto the KMS core nodes.
+#
+# Usage:
+#   # Build with heap profiling enabled
+#   make build-compose-heap-profiling
+#
+#   # Run normally, then at any point dump + copy + analyze:
+#   make dump-heap-profiles
+#   ./profiling/analyze-heap.sh ./profiling/heap-dumps/kms-server ./profiling/heap-dumps/core-1/
+#
+# The key insight: take a dump BEFORE sustained load, and another AFTER.
+# The diff shows only allocations that grew — i.e., your leaks.
+
+x-heap-profiling-build: &heap-profiling-build
+  args:
+    CARGO_EXTRA_FEATURES: heap-profiling
+    LTO_RELEASE: heap-profiling
+    # Frame pointers give jemalloc reliable stack traces for allocation profiling.
+    # Without them, some frames may be missing from heap dump backtraces.
+    RUSTFLAGS: "-C force-frame-pointers=yes"
+
+x-heap-profiling-env: &heap-profiling-env
+  # Enable jemalloc profiling:
+  #   prof:true          — turn the allocation profiler on
+  #   lg_prof_sample:12  — sample on average every 2^12 = 4096 bytes allocated
+  #   prof_gdump:true    — auto-dump whenever total memory hits a new global peak
+  #   prof_final:true    — write a final dump when the process exits
+  - MALLOC_CONF=prof:true,lg_prof_sample:12,prof_gdump:true,prof_final:true,prof_prefix:/tmp/kms-heap/auto/prof
+
+services:
+  dev-kms-core-gen-signing-keys-ca-certs:
+    image: ghcr.io/zama-ai/kms/core-service:latest-dev-heap-profiling
+    build:
+      <<: *heap-profiling-build
+
+  dev-kms-core-1:
+    image: ghcr.io/zama-ai/kms/core-service:latest-dev-heap-profiling
+    build:
+      <<: *heap-profiling-build
+    environment: *heap-profiling-env
+
+  dev-kms-core-2:
+    image: ghcr.io/zama-ai/kms/core-service:latest-dev-heap-profiling
+    build:
+      <<: *heap-profiling-build
+    environment: *heap-profiling-env
+
+  dev-kms-core-3:
+    image: ghcr.io/zama-ai/kms/core-service:latest-dev-heap-profiling
+    build:
+      <<: *heap-profiling-build
+    environment: *heap-profiling-env
+
+  dev-kms-core-4:
+    image: ghcr.io/zama-ai/kms/core-service:latest-dev-heap-profiling
+    build:
+      <<: *heap-profiling-build
+    environment: *heap-profiling-env
+
+  dev-kms-core-init:
+    image: ghcr.io/zama-ai/kms/core-service:latest-dev-heap-profiling
+    build:
+      <<: *heap-profiling-build