Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions .github/workflows/docker-scan.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ jobs:
with:
# Define paths that trigger specific component workflows
# Changes to observability affect multiple components
list-files: 'json'
list-files: "json"
filters: |
dockerfile:
- 'docker/**/Dockerfile'
Expand Down Expand Up @@ -88,7 +88,7 @@ jobs:
build-args: |
RUST_IMAGE_VERSION=${{ env.RUST_IMAGE_VERSION }}
TARGETARCH=amd64
context: '.'
context: "."
secrets: BLOCKCHAIN_ACTIONS_TOKEN=${{ secrets.BLOCKCHAIN_ACTIONS_TOKEN }}
file: ${{ matrix.dockerfile }}
platforms: linux/amd64
Expand All @@ -102,11 +102,11 @@ jobs:
base:latest

- name: Scan image with Trivy
uses: aquasecurity/trivy-action@c1824fd6edce30d7ab345a9989de00bbd46ef284 # v0.34.0
uses: aquasecurity/trivy-action@97e0b3872f55f89b95b2f65b3dbab56962816478 # v0.34.2
with:
image-ref: base:latest
format: 'table'
severity: 'CRITICAL,HIGH'
format: "table"
severity: "CRITICAL,HIGH"
output: trivy-result.txt

- name: Check Trivy result file
Expand Down
34 changes: 34 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 12 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,8 @@ tfhe-csprng = "=0.8.0" # Cryptographically secure PRNG for TFHE - LOW RISK: Zam
tfhe-versionable = "=0.7.0" # TFHE versioning support - LOW RISK: Zama
tfhe-zk-pok = "=0.8.0" # Zero-knowledge proofs for TFHE - LOW RISK: Zama
thiserror = "=2.0.12" # Error derive macro - MEDIUM RISK: Reputable individual maintainer (dtolnay), 545M downloads
tikv-jemalloc-ctl = { version = "=0.6.1", features = ["use_std", "stats"] } # jemalloc runtime control - LOW RISK: TiKV team (PingCAP), bindings to jemalloc
tikv-jemallocator = { version = "=0.6.1", features = ["profiling", "unprefixed_malloc_on_supported_platforms"] } # jemalloc allocator with heap profiling - LOW RISK: TiKV team (PingCAP), 17M+ downloads
tokio = { version = "=1.46.1", features = ["full"] } # Async runtime - LOW RISK: tokio team, industry standard
tokio-rustls = { version = "=0.26.2", default-features = false, features = ["aws_lc_rs"] } # Async TLS - LOW RISK: rustls team, memory-safe TLS implementation
tokio-util = { version = "=0.7.15", features = ["rt"] } # Tokio utilities - LOW RISK: tokio team
Expand Down Expand Up @@ -243,6 +245,16 @@ inherits = "release"
# but set LTO to the default (off instead of fat)
lto = "off"

# Profile for heap profiling with jemalloc — keeps line-table debug info
# so jeprof can resolve addresses to function:line instead of ??:0.
# Uses debug=1 (Cargo's "limited" level: line tables and module-level info,
# but no type or variable-level info) instead of debug=2 (full DWARF) to
# dramatically reduce build time while retaining all info jeprof needs.
[profile.heap-profiling]
inherits = "release"
debug = 1
strip = "none"
lto = "off"

[patch.crates-io]
# MEDIUM RISK: Using fork instead of upstream - verify changes, consider upstreaming
attestation-doc-validation = { git = 'https://github.com/mkmks/attestation-doc-validation.git', branch = 'timestamps' }
Expand Down
32 changes: 32 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,38 @@ start-compose-threshold-telemetry:
stop-compose-threshold-telemetry:
docker compose -vvv -f docker-compose-core-base.yml -f docker-compose-core-threshold.yml -f docker-compose-telemetry.yml down --volumes --remove-orphans

# Build the images for the threshold cores with the heap-profiling and
# telemetry compose overlays applied.
build-compose-heap-profiling:
	docker compose \
		-f docker-compose-core-base.yml \
		-f docker-compose-core-threshold.yml \
		-f profiling/docker-compose-heap-profiling.yml \
		-f docker-compose-telemetry.yml \
		build

# Start the heap-profiling stack in the background and wait until it is up.
start-compose-heap-profiling:
	docker compose \
		-f docker-compose-core-base.yml \
		-f docker-compose-core-threshold.yml \
		-f profiling/docker-compose-heap-profiling.yml \
		-f docker-compose-telemetry.yml \
		up -d --wait

# Stop the heap-profiling stack, removing its volumes and orphaned containers.
stop-compose-heap-profiling:
	docker compose \
		-f docker-compose-core-base.yml \
		-f docker-compose-core-threshold.yml \
		-f profiling/docker-compose-heap-profiling.yml \
		-f docker-compose-telemetry.yml \
		down --volumes --remove-orphans

# Compose invocation used to reach the running heap-profiling cores.
HEAP_DUMP_COMPOSE := docker compose -f docker-compose-core-base.yml -f docker-compose-core-threshold.yml -f profiling/docker-compose-heap-profiling.yml

# Dump heap profiles from all cores and copy them locally for analysis.
#
# For each of the four cores this:
#   1. sends SIGUSR1 to kms-server so it writes a profile under /tmp/kms-heap,
#   2. copies the contents of /tmp/kms-heap into profiling/heap-dumps/core-N/,
#   3. captures /proc/<pid>/maps of kms-server into core-N/maps.txt.
# Finally the kms-server binary is copied from core 1 for symbol resolution.
#
# Every step is best effort: a failing core produces a warning on stderr but
# does not abort the remaining cores.
dump-heap-profiles:
	@mkdir -p profiling/heap-dumps
	@for i in 1 2 3 4; do \
		echo "Dumping heap profile for dev-kms-core-$$i..."; \
		$(HEAP_DUMP_COMPOSE) exec -T dev-kms-core-$$i killall -USR1 kms-server 2>/dev/null \
			|| echo "  WARNING: could not signal kms-server in dev-kms-core-$$i" >&2; \
	done
	@sleep 1
	@for i in 1 2 3 4; do \
		mkdir -p ./profiling/heap-dumps/core-$$i; \
		echo "Copying dumps from dev-kms-core-$$i..."; \
		$(HEAP_DUMP_COMPOSE) cp dev-kms-core-$$i:/tmp/kms-heap/. ./profiling/heap-dumps/core-$$i/ 2>/dev/null \
			|| echo "  WARNING: could not copy heap dumps from dev-kms-core-$$i" >&2; \
		echo "Capturing /proc/maps for dev-kms-core-$$i..."; \
		$(HEAP_DUMP_COMPOSE) exec -T dev-kms-core-$$i sh -c 'cat /proc/$$(pidof kms-server)/maps' \
			> ./profiling/heap-dumps/core-$$i/maps.txt 2>/dev/null \
			|| echo "  WARNING: could not capture memory maps for dev-kms-core-$$i" >&2; \
	done
	@echo "Copying kms-server binary for symbol resolution..."
	@$(HEAP_DUMP_COMPOSE) cp dev-kms-core-1:/app/kms/core/service/bin/kms-server ./profiling/heap-dumps/kms-server 2>/dev/null \
		|| echo "  WARNING: could not copy kms-server binary from dev-kms-core-1" >&2
	@echo "Done. Analyze with: ./profiling/analyze-heap.sh ./profiling/heap-dumps/kms-server ./profiling/heap-dumps/core-1/"

# Test backwards compatibility with LFS files. This will pull the LFS files from git before running the tests.
test-backward-compatibility: pull-lfs-files
cargo test --test backward_compatibility_* -- --include-ignored
Expand Down
7 changes: 7 additions & 0 deletions core/service/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,8 @@ tfhe = { workspace = true, features = [
] }
tfhe-versionable.workspace = true
thiserror.workspace = true
tikv-jemalloc-ctl = { workspace = true, optional = true }
tikv-jemallocator = { workspace = true, optional = true }
threshold-fhe = { workspace = true, default-features = false, features = [
"extension_degree_4",
] }
Expand Down Expand Up @@ -175,6 +177,11 @@ non-wasm = [
"dep:tower-http",
"dep:x509-parser",
]
heap-profiling = [
"observability?/heap-profiling",
"dep:tikv-jemalloc-ctl",
"dep:tikv-jemallocator",
]
slow_tests = ["testing"]
wasm_tests = ["testing"]
s3_tests = ["testing"]
Expand Down
2 changes: 1 addition & 1 deletion core/service/config/compose_centralized.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,4 +42,4 @@ user_decrypt = 1
crsgen = 100
preproc = 25000
keygen = 1000
new_epoch = 1
new_epoch = 1
7 changes: 7 additions & 0 deletions core/service/src/bin/kms-server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ use tokio_rustls::rustls::{
version::TLS13,
};

// When the `heap-profiling` feature is enabled, register jemalloc as the
// global allocator for this binary.
// NOTE(review): presumably required so that jemalloc's profiler sees the
// server's allocations when a heap dump is requested — confirm.
#[cfg(feature = "heap-profiling")]
#[global_allocator]
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;

#[derive(Parser)]
#[clap(name = "KMS server")]
#[clap(
Expand Down Expand Up @@ -346,6 +350,9 @@ fn main() -> anyhow::Result<()> {
/// Note that key material MUST exist when starting the server and be stored in the path specified by the configuration file.
/// Please consult the `kms-gen-keys` binary for details on generating key material.
async fn main_exec() -> anyhow::Result<()> {
#[cfg(feature = "heap-profiling")]
kms_lib::heap_profiling::install_sigusr1_handler();

let args = KmsArgs::parse();
let (mut core_config, tracer_provider, meter_provider) =
init_conf_kms_core_telemetry::<CoreConfig>(&args.config_file).await?;
Expand Down
80 changes: 80 additions & 0 deletions core/service/src/heap_profiling.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
//! Heap profiling support using jemalloc.
//!
//! When the `heap-profiling` feature is enabled and `MALLOC_CONF` includes
//! `prof:true`, this module provides on-demand heap dumps.
//!
//! # Quick Start
//!
//! For the full Docker-based workflow (handles PIE/ASLR, symbol resolution,
//! and diff analysis automatically), see `profiling/README.md`.
//!
//! Manual (non-PIE binary) usage:
//!
//! 1. Build with: `cargo build -p kms --bin kms-server --profile heap-profiling -F heap-profiling`
//! 2. Run with env: `MALLOC_CONF=prof:true,lg_prof_sample:12 kms-server ...`
//! (use `lg_prof_sample:19` for lower overhead — see `profiling/README.md`)
//! 3. Dump heap: `kill -USR1 <pid>`
//! 4. Analyze: `jeprof --svg kms-server /tmp/kms-heap/prof.0000.heap > heap.svg`
//!    (dumps are numbered per process, starting at `prof.0000.heap`)
//! 5. Diff two dumps: `jeprof --base=prof.0001.heap --svg kms-server prof.0010.heap > diff.svg`

use std::sync::atomic::{AtomicUsize, Ordering};

const HEAP_DUMP_DIR: &str = "/tmp/kms-heap";

static DUMP_SEQ: AtomicUsize = AtomicUsize::new(0);

/// Write a jemalloc heap profile to `/tmp/kms-heap/prof.NNNN.heap`.
///
/// `NNNN` is a zero-padded sequence number taken from `DUMP_SEQ`; it is
/// incremented on every call, whether or not the dump succeeds.
///
/// The output directory is created if it is missing. Failing to create it is
/// only logged as a warning; the dump is still attempted.
///
/// # Returns
///
/// * `Ok(path)` — the path of the profile that was written.
/// * `Err(message)` — the path could not be turned into a C string, or
///   jemalloc's `prof.dump` control call reported an error.
pub fn dump_heap_profile() -> Result<String, String> {
    // Best effort: creating an already existing directory is not an error.
    if let Err(e) = std::fs::create_dir_all(HEAP_DUMP_DIR) {
        eprintln!("[heap-profiling] WARNING: failed to create {HEAP_DUMP_DIR}: {e}");
    }

    let index = DUMP_SEQ.fetch_add(1, Ordering::Relaxed);
    let dump_path = format!("{HEAP_DUMP_DIR}/prof.{index:04}.heap");
    let c_path = std::ffi::CString::new(dump_path.as_str())
        .map_err(|e| format!("invalid path: {e}"))?;

    // SAFETY: `c_path` is a valid null-terminated C string that stays alive
    // until the end of this function, i.e. beyond the call below. jemalloc's
    // `prof.dump` mallctl expects a `const char *`, and `raw::write` hands a
    // pointer to that value to jemalloc as `newp`, matching the expected ABI.
    let outcome = unsafe { tikv_jemalloc_ctl::raw::write(b"prof.dump\0", c_path.as_ptr()) };

    if let Err(e) = outcome {
        let msg = format!("jemalloc prof.dump failed: {e}. Is MALLOC_CONF=prof:true set?");
        eprintln!("[heap-profiling] ERROR: {msg}");
        return Err(msg);
    }

    eprintln!("[heap-profiling] Dumped to {dump_path}");
    Ok(dump_path)
}

/// Install a SIGUSR1 handler that triggers heap profile dumps.
///
/// Call this once at startup. Then `kill -USR1 <pid>` to dump.
pub fn install_sigusr1_handler() {
if let Err(e) = std::fs::create_dir_all(HEAP_DUMP_DIR) {
eprintln!("[heap-profiling] WARNING: failed to create {HEAP_DUMP_DIR}: {e}");
}

// Spawn a background tokio task to listen for SIGUSR1
tokio::spawn(async {
let mut sig = tokio::signal::unix::signal(tokio::signal::unix::SignalKind::user_defined1())
.expect("Failed to register SIGUSR1 handler");

eprintln!("[heap-profiling] Ready — send SIGUSR1 to dump heap profile to {HEAP_DUMP_DIR}/");

loop {
sig.recv().await;
let _ = dump_heap_profile();
}
});
}
3 changes: 3 additions & 0 deletions core/service/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@ pub mod testing;
#[cfg(feature = "non-wasm")]
pub mod vault;

#[cfg(feature = "heap-profiling")]
pub mod heap_profiling;

#[cfg(feature = "non-wasm")]
pub use kms_grpc::utils::tonic_result::BoxedStatus;

Expand Down
10 changes: 7 additions & 3 deletions docker/core/service/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@ FROM --platform=$BUILDPLATFORM ghcr.io/zama-ai/kms/rust-golden-image:latest AS k
# But you can provide --build-arg LTO_RELEASE="--profile release-lto-off" locally to build locally
ARG LTO_RELEASE=release
ARG APP_CACHE_DIR=kms
# Extra Cargo features to enable, comma-separated (e.g., "heap-profiling" for jemalloc profiling)
ARG CARGO_EXTRA_FEATURES=""
ARG RUSTFLAGS=""
ENV RUSTFLAGS=${RUSTFLAGS}

# Fetch dependencies and build binaries
WORKDIR /app/kms
Expand All @@ -24,11 +28,11 @@ COPY docker ./docker
RUN mkdir -p /app/kms/core/service/bin

RUN --mount=type=cache,target=/root/.cargo/registry,sharing=locked \
--mount=type=cache,target=/app/${APP_CACHE_DIR}/target,sharing=locked \
--mount=type=cache,target=/app/${APP_CACHE_DIR}/target,sharing=locked,id=cargo-target-${LTO_RELEASE} \
cargo fetch --locked
RUN --mount=type=cache,target=/root/.cargo/registry,sharing=locked \
--mount=type=cache,target=/app/${APP_CACHE_DIR}/target,sharing=locked \
cargo build --locked --profile=${LTO_RELEASE} -p kms --bin kms-server --bin kms-gen-tls-certs --bin kms-init --bin kms-custodian -F insecure && \
--mount=type=cache,target=/app/${APP_CACHE_DIR}/target,sharing=locked,id=cargo-target-${LTO_RELEASE} \
cargo build --locked --profile=${LTO_RELEASE} -p kms --bin kms-server --bin kms-gen-tls-certs --bin kms-init --bin kms-custodian -F insecure ${CARGO_EXTRA_FEATURES:+-F "$CARGO_EXTRA_FEATURES"} && \
cargo build --locked --profile=${LTO_RELEASE} -p kms --bin kms-gen-keys -F testing -F threshold-fhe/testing -F insecure && \
cp /app/kms/target/${LTO_RELEASE}/kms-server \
/app/kms/target/${LTO_RELEASE}/kms-gen-tls-certs \
Expand Down
2 changes: 2 additions & 0 deletions observability/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ strum.workspace = true
strum_macros.workspace = true
sysinfo.workspace = true
thiserror.workspace = true
tikv-jemalloc-ctl = { workspace = true, optional = true }
tokio = { workspace = true, features = ["rt", "rt-multi-thread", "macros", "net"] }
tonic.workspace = true
tracing = { workspace = true, features = ["log", "async-await"] }
Expand All @@ -48,3 +49,4 @@ ignored = ["strum"]

[features]
default = []
heap-profiling = ["dep:tikv-jemalloc-ctl"]
Loading
Loading