diff --git a/.github/workflows/docker-scan.yml b/.github/workflows/docker-scan.yml index 3ffffcecc..b1324956a 100644 --- a/.github/workflows/docker-scan.yml +++ b/.github/workflows/docker-scan.yml @@ -27,7 +27,7 @@ jobs: with: # Define paths that trigger specific component workflows # Changes to observability affect multiple components - list-files: 'json' + list-files: "json" filters: | dockerfile: - 'docker/**/Dockerfile' @@ -88,7 +88,7 @@ jobs: build-args: | RUST_IMAGE_VERSION=${{ env.RUST_IMAGE_VERSION }} TARGETARCH=amd64 - context: '.' + context: "." secrets: BLOCKCHAIN_ACTIONS_TOKEN=${{ secrets.BLOCKCHAIN_ACTIONS_TOKEN }} file: ${{ matrix.dockerfile }} platforms: linux/amd64 @@ -102,11 +102,11 @@ jobs: base:latest - name: Scan image with Trivy - uses: aquasecurity/trivy-action@c1824fd6edce30d7ab345a9989de00bbd46ef284 # v0.34.0 + uses: aquasecurity/trivy-action@97e0b3872f55f89b95b2f65b3dbab56962816478 # v0.34.2 with: image-ref: base:latest - format: 'table' - severity: 'CRITICAL,HIGH' + format: "table" + severity: "CRITICAL,HIGH" output: trivy-result.txt - name: Check Trivy result file diff --git a/Cargo.lock b/Cargo.lock index 7a71577c7..ef3f9c440 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4320,6 +4320,8 @@ dependencies = [ "tfhe-versionable", "thiserror 2.0.12", "threshold-fhe", + "tikv-jemalloc-ctl", + "tikv-jemallocator", "tokio", "tokio-rustls 0.26.2", "tokio-util", @@ -5002,6 +5004,7 @@ dependencies = [ "sysinfo", "tempfile", "thiserror 2.0.12", + "tikv-jemalloc-ctl", "tokio", "tonic 0.13.1", "tracing", @@ -7755,6 +7758,37 @@ dependencies = [ "zeroize", ] +[[package]] +name = "tikv-jemalloc-ctl" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "661f1f6a57b3a36dc9174a2c10f19513b4866816e13425d3e418b11cc37bc24c" +dependencies = [ + "libc", + "paste", + "tikv-jemalloc-sys", +] + +[[package]] +name = "tikv-jemalloc-sys" +version = "0.6.1+5.3.0-1-ge13ca993e8ccb9ba9847cc330696e02839f328f7" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd8aa5b2ab86a2cefa406d889139c162cbb230092f7d1d7cbc1716405d852a3b" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "tikv-jemallocator" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0359b4327f954e0567e69fb191cf1436617748813819c94b8cd4a431422d053a" +dependencies = [ + "libc", + "tikv-jemalloc-sys", +] + [[package]] name = "time" version = "0.3.47" diff --git a/Cargo.toml b/Cargo.toml index 523edf5f2..668ac845d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -185,6 +185,8 @@ tfhe-csprng = "=0.8.0" # Cryptographically secure PRNG for TFHE - LOW RISK: Zam tfhe-versionable = "=0.7.0" # TFHE versioning support - LOW RISK: Zama tfhe-zk-pok = "=0.8.0" # Zero-knowledge proofs for TFHE - LOW RISK: Zama thiserror = "=2.0.12" # Error derive macro - MEDIUM RISK: Reputable individual maintainer (dtolnay), 545M downloads +tikv-jemalloc-ctl = { version = "=0.6.1", features = ["use_std", "stats"] } # jemalloc runtime control - LOW RISK: TiKV team (PingCAP), bindings to jemalloc +tikv-jemallocator = { version = "=0.6.1", features = ["profiling", "unprefixed_malloc_on_supported_platforms"] } # jemalloc allocator with heap profiling - LOW RISK: TiKV team (PingCAP), 17M+ downloads tokio = { version = "=1.46.1", features = ["full"] } # Async runtime - LOW RISK: tokio team, industry standard tokio-rustls = { version = "=0.26.2", default-features = false, features = ["aws_lc_rs"] } # Async TLS - LOW RISK: rustls team, memory-safe TLS implementation tokio-util = { version = "=0.7.15", features = ["rt"] } # Tokio utilities - LOW RISK: tokio team @@ -243,6 +245,16 @@ inherits = "release" # but set LTO to the default (off instead of fat) lto = "off" +# profile for heap profiling with jemalloc — keeps line-table debug info +# so jeprof can resolve addresses to function:line instead of ??:0. 
+# Uses debug=1 (line tables only) instead of debug=2 (full DWARF) to +# dramatically reduce build time while retaining all info jeprof needs. +[profile.heap-profiling] +inherits = "release" +debug = 1 +strip = "none" +lto = "off" + [patch.crates-io] # MEDIUM RISK: Using fork instead of upstream - verify changes, consider upstreaming attestation-doc-validation = { git = 'https://github.com/mkmks/attestation-doc-validation.git', branch = 'timestamps' } diff --git a/Makefile b/Makefile index 9b237f347..dd60274d2 100644 --- a/Makefile +++ b/Makefile @@ -25,6 +25,38 @@ start-compose-threshold-telemetry: stop-compose-threshold-telemetry: docker compose -vvv -f docker-compose-core-base.yml -f docker-compose-core-threshold.yml -f docker-compose-telemetry.yml down --volumes --remove-orphans +build-compose-heap-profiling: + docker compose -f docker-compose-core-base.yml -f docker-compose-core-threshold.yml -f profiling/docker-compose-heap-profiling.yml -f docker-compose-telemetry.yml build + +start-compose-heap-profiling: + docker compose -f docker-compose-core-base.yml -f docker-compose-core-threshold.yml -f profiling/docker-compose-heap-profiling.yml -f docker-compose-telemetry.yml up -d --wait + +stop-compose-heap-profiling: + docker compose -f docker-compose-core-base.yml -f docker-compose-core-threshold.yml -f profiling/docker-compose-heap-profiling.yml -f docker-compose-telemetry.yml down --volumes --remove-orphans + +# Dump heap profiles from all cores and copy them locally for analysis +dump-heap-profiles: + @mkdir -p profiling/heap-dumps + @for i in 1 2 3 4; do \ + echo "Dumping heap profile for dev-kms-core-$$i..."; \ + docker compose -f docker-compose-core-base.yml -f docker-compose-core-threshold.yml -f profiling/docker-compose-heap-profiling.yml \ + exec dev-kms-core-$$i killall -USR1 kms-server 2>/dev/null || true; \ + done + @sleep 1 + @for i in 1 2 3 4; do \ + echo "Copying dumps from dev-kms-core-$$i..."; \ + docker compose -f docker-compose-core-base.yml 
-f docker-compose-core-threshold.yml -f profiling/docker-compose-heap-profiling.yml \ + cp dev-kms-core-$$i:/tmp/kms-heap/ ./profiling/heap-dumps/core-$$i/ 2>/dev/null || true; \ + echo "Capturing /proc/maps for dev-kms-core-$$i..."; \ + docker compose -f docker-compose-core-base.yml -f docker-compose-core-threshold.yml -f profiling/docker-compose-heap-profiling.yml \ + exec -T dev-kms-core-$$i sh -c 'cat /proc/$$(pidof kms-server)/maps' \ + > ./profiling/heap-dumps/core-$$i/maps.txt 2>/dev/null || true; \ + done + @echo "Copying kms-server binary for symbol resolution..." + @docker compose -f docker-compose-core-base.yml -f docker-compose-core-threshold.yml -f profiling/docker-compose-heap-profiling.yml \ + cp dev-kms-core-1:/app/kms/core/service/bin/kms-server ./profiling/heap-dumps/kms-server 2>/dev/null || true + @echo "Done. Analyze with: ./profiling/analyze-heap.sh ./profiling/heap-dumps/kms-server ./profiling/heap-dumps/core-1/" + # Test backwards compatibility with LFS files. This will pull the LFS files from git before running the tests. 
test-backward-compatibility: pull-lfs-files cargo test --test backward_compatibility_* -- --include-ignored diff --git a/core/service/Cargo.toml b/core/service/Cargo.toml index 68b4e212f..92ab54aca 100644 --- a/core/service/Cargo.toml +++ b/core/service/Cargo.toml @@ -104,6 +104,8 @@ tfhe = { workspace = true, features = [ ] } tfhe-versionable.workspace = true thiserror.workspace = true +tikv-jemalloc-ctl = { workspace = true, optional = true } +tikv-jemallocator = { workspace = true, optional = true } threshold-fhe = { workspace = true, default-features = false, features = [ "extension_degree_4", ] } @@ -175,6 +177,11 @@ non-wasm = [ "dep:tower-http", "dep:x509-parser", ] +heap-profiling = [ + "observability?/heap-profiling", + "dep:tikv-jemalloc-ctl", + "dep:tikv-jemallocator", +] slow_tests = ["testing"] wasm_tests = ["testing"] s3_tests = ["testing"] diff --git a/core/service/config/compose_centralized.toml b/core/service/config/compose_centralized.toml index 52f945dc4..7c4ec4ade 100644 --- a/core/service/config/compose_centralized.toml +++ b/core/service/config/compose_centralized.toml @@ -42,4 +42,4 @@ user_decrypt = 1 crsgen = 100 preproc = 25000 keygen = 1000 -new_epoch = 1 \ No newline at end of file +new_epoch = 1 diff --git a/core/service/src/bin/kms-server.rs b/core/service/src/bin/kms-server.rs index a34954388..2f17d0bff 100644 --- a/core/service/src/bin/kms-server.rs +++ b/core/service/src/bin/kms-server.rs @@ -46,6 +46,10 @@ use tokio_rustls::rustls::{ version::TLS13, }; +#[cfg(feature = "heap-profiling")] +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + #[derive(Parser)] #[clap(name = "KMS server")] #[clap( @@ -346,6 +350,9 @@ fn main() -> anyhow::Result<()> { /// Note that key material MUST exist when starting the server and be stored in the path specified by the configuration file. /// Please consult the `kms-gen-keys` binary for details on generating key material. 
async fn main_exec() -> anyhow::Result<()> { + #[cfg(feature = "heap-profiling")] + kms_lib::heap_profiling::install_sigusr1_handler(); + let args = KmsArgs::parse(); let (mut core_config, tracer_provider, meter_provider) = init_conf_kms_core_telemetry::(&args.config_file).await?; diff --git a/core/service/src/heap_profiling.rs b/core/service/src/heap_profiling.rs new file mode 100644 index 000000000..f5c36cb28 --- /dev/null +++ b/core/service/src/heap_profiling.rs @@ -0,0 +1,80 @@ +//! Heap profiling support using jemalloc. +//! +//! When the `heap-profiling` feature is enabled and `MALLOC_CONF` includes +//! `prof:true`, this module provides on-demand heap dumps. +//! +//! # Quick Start +//! +//! For the full Docker-based workflow (handles PIE/ASLR, symbol resolution, +//! and diff analysis automatically), see `profiling/README.md`. +//! +//! Manual (non-PIE binary) usage: +//! +//! 1. Build with: `cargo build -p kms --bin kms-server --profile heap-profiling -F heap-profiling` +//! 2. Run with env: `MALLOC_CONF=prof:true,lg_prof_sample:12 kms-server ...` +//! (use `lg_prof_sample:19` for lower overhead — see `profiling/README.md`) +//! 3. Dump heap: `kill -USR1 ` +//! 4. Analyze: `jeprof --svg kms-server /tmp/kms-heap/prof.0001.heap > heap.svg` +//! 5. Diff two dumps: `jeprof --base=prof.0001.heap --svg kms-server prof.0010.heap > diff.svg` + +use std::sync::atomic::{AtomicUsize, Ordering}; + +const HEAP_DUMP_DIR: &str = "/tmp/kms-heap"; + +static DUMP_SEQ: AtomicUsize = AtomicUsize::new(0); + +/// Dump a heap profile to `/tmp/kms-heap/prof.NNNN.heap`. +/// +/// Creates the output directory if it does not already exist. 
+pub fn dump_heap_profile() -> Result { + // Ensure the output directory exists (idempotent) + if let Err(e) = std::fs::create_dir_all(HEAP_DUMP_DIR) { + eprintln!("[heap-profiling] WARNING: failed to create {HEAP_DUMP_DIR}: {e}"); + } + + let seq = DUMP_SEQ.fetch_add(1, Ordering::Relaxed); + let path_str = format!("{HEAP_DUMP_DIR}/prof.{seq:04}.heap"); + let path_c = + std::ffi::CString::new(path_str.clone()).map_err(|e| format!("invalid path: {e}"))?; + + // jemalloc mallctl expects a pointer to the filename string + let ptr = path_c.as_ptr(); + // SAFETY: `ptr` points to a valid null-terminated C string (`path_c`) that + // outlives this call. jemalloc's `prof.dump` mallctl expects a `const char *` + // and `raw::write` passes `&ptr` as `newp`, matching the expected ABI. + let result = unsafe { tikv_jemalloc_ctl::raw::write(b"prof.dump\0", ptr) }; + + match result { + Ok(()) => { + eprintln!("[heap-profiling] Dumped to {path_str}"); + Ok(path_str) + } + Err(e) => { + let msg = format!("jemalloc prof.dump failed: {e}. Is MALLOC_CONF=prof:true set?"); + eprintln!("[heap-profiling] ERROR: {msg}"); + Err(msg) + } + } +} + +/// Install a SIGUSR1 handler that triggers heap profile dumps. +/// +/// Call this once at startup. Then `kill -USR1 ` to dump. 
+pub fn install_sigusr1_handler() { + if let Err(e) = std::fs::create_dir_all(HEAP_DUMP_DIR) { + eprintln!("[heap-profiling] WARNING: failed to create {HEAP_DUMP_DIR}: {e}"); + } + + // Spawn a background tokio task to listen for SIGUSR1 + tokio::spawn(async { + let mut sig = tokio::signal::unix::signal(tokio::signal::unix::SignalKind::user_defined1()) + .expect("Failed to register SIGUSR1 handler"); + + eprintln!("[heap-profiling] Ready — send SIGUSR1 to dump heap profile to {HEAP_DUMP_DIR}/"); + + loop { + sig.recv().await; + let _ = dump_heap_profile(); + } + }); +} diff --git a/core/service/src/lib.rs b/core/service/src/lib.rs index 04705665b..ebc54467b 100644 --- a/core/service/src/lib.rs +++ b/core/service/src/lib.rs @@ -35,6 +35,9 @@ pub mod testing; #[cfg(feature = "non-wasm")] pub mod vault; +#[cfg(feature = "heap-profiling")] +pub mod heap_profiling; + #[cfg(feature = "non-wasm")] pub use kms_grpc::utils::tonic_result::BoxedStatus; diff --git a/docker/core/service/Dockerfile b/docker/core/service/Dockerfile index d9f24ba6d..0cc3ebd9b 100644 --- a/docker/core/service/Dockerfile +++ b/docker/core/service/Dockerfile @@ -8,6 +8,10 @@ FROM --platform=$BUILDPLATFORM ghcr.io/zama-ai/kms/rust-golden-image:latest AS k # But you can provide --build-arg LTO_RELEASE="--profile release-lto-off" locally to build locally ARG LTO_RELEASE=release ARG APP_CACHE_DIR=kms +# Extra Cargo features to enable, comma-separated (e.g., "heap-profiling" for jemalloc profiling) +ARG CARGO_EXTRA_FEATURES="" +ARG RUSTFLAGS="" +ENV RUSTFLAGS=${RUSTFLAGS} # Fetch dependencies and build binaries WORKDIR /app/kms @@ -24,11 +28,11 @@ COPY docker ./docker RUN mkdir -p /app/kms/core/service/bin RUN --mount=type=cache,target=/root/.cargo/registry,sharing=locked \ - --mount=type=cache,target=/app/${APP_CACHE_DIR}/target,sharing=locked \ + --mount=type=cache,target=/app/${APP_CACHE_DIR}/target,sharing=locked,id=cargo-target-${LTO_RELEASE} \ cargo fetch --locked RUN 
--mount=type=cache,target=/root/.cargo/registry,sharing=locked \ - --mount=type=cache,target=/app/${APP_CACHE_DIR}/target,sharing=locked \ - cargo build --locked --profile=${LTO_RELEASE} -p kms --bin kms-server --bin kms-gen-tls-certs --bin kms-init --bin kms-custodian -F insecure && \ + --mount=type=cache,target=/app/${APP_CACHE_DIR}/target,sharing=locked,id=cargo-target-${LTO_RELEASE} \ + cargo build --locked --profile=${LTO_RELEASE} -p kms --bin kms-server --bin kms-gen-tls-certs --bin kms-init --bin kms-custodian -F insecure ${CARGO_EXTRA_FEATURES:+-F "$CARGO_EXTRA_FEATURES"} && \ cargo build --locked --profile=${LTO_RELEASE} -p kms --bin kms-gen-keys -F testing -F threshold-fhe/testing -F insecure && \ cp /app/kms/target/${LTO_RELEASE}/kms-server \ /app/kms/target/${LTO_RELEASE}/kms-gen-tls-certs \ diff --git a/observability/Cargo.toml b/observability/Cargo.toml index 68ee7840a..9b6f0cf9f 100644 --- a/observability/Cargo.toml +++ b/observability/Cargo.toml @@ -29,6 +29,7 @@ strum.workspace = true strum_macros.workspace = true sysinfo.workspace = true thiserror.workspace = true +tikv-jemalloc-ctl = { workspace = true, optional = true } tokio = { workspace = true, features = ["rt", "rt-multi-thread", "macros", "net"] } tonic.workspace = true tracing = { workspace = true, features = ["log", "async-await"] } @@ -48,3 +49,4 @@ ignored = ["strum"] [features] default = [] +heap-profiling = ["dep:tikv-jemalloc-ctl"] diff --git a/observability/src/metrics.rs b/observability/src/metrics.rs index 6687026b7..ab0524835 100644 --- a/observability/src/metrics.rs +++ b/observability/src/metrics.rs @@ -80,6 +80,10 @@ pub struct CoreMetrics { process_cpu_usage_gauge: TaggedMetric>, // CPU load for the current process in percentage total_memory_gauge: TaggedMetric>, // Total memory available process_memory_gauge: TaggedMetric>, // Memory usage for the current process + #[cfg(feature = "heap-profiling")] + jemalloc_allocated_gauge: TaggedMetric>, // Bytes actively allocated by 
the application (via jemalloc) + #[cfg(feature = "heap-profiling")] + jemalloc_resident_gauge: TaggedMetric>, // Bytes mapped by jemalloc from OS cpu_load_gauge: TaggedMetric>, // 1-minute average CPU load, divided by number of cores memory_usage_gauge: TaggedMetric>, // Trace guard for file-based logging @@ -146,6 +150,12 @@ impl CoreMetrics { format!("{}_total_memory", config.prefix).into(); let process_memory_metric: Cow<'static, str> = format!("{}_process_memory_usage", config.prefix).into(); + #[cfg(feature = "heap-profiling")] + let jemalloc_allocated_metric: Cow<'static, str> = + format!("{}_jemalloc_allocated", config.prefix).into(); + #[cfg(feature = "heap-profiling")] + let jemalloc_resident_metric: Cow<'static, str> = + format!("{}_jemalloc_resident", config.prefix).into(); let cpu_load_metric: Cow<'static, str> = format!("{}_cpu_load", config.prefix).into(); let memory_usage_metric: Cow<'static, str> = format!("{}_memory_usage", config.prefix).into(); @@ -318,6 +328,26 @@ impl CoreMetrics { //Record 0 just to make sure the gauge is exported process_memory_gauge.record(0, &[]); + #[cfg(feature = "heap-profiling")] + let jemalloc_allocated_gauge = meter + .u64_gauge(jemalloc_allocated_metric) + .with_description("Bytes actively allocated by the application (via jemalloc)") + .with_unit("bytes") + .build(); + #[cfg(feature = "heap-profiling")] + //Record 0 just to make sure the gauge is exported + jemalloc_allocated_gauge.record(0, &[]); + + #[cfg(feature = "heap-profiling")] + let jemalloc_resident_gauge = meter + .u64_gauge(jemalloc_resident_metric) + .with_description("Bytes mapped by jemalloc from OS (resident set)") + .with_unit("bytes") + .build(); + #[cfg(feature = "heap-profiling")] + //Record 0 just to make sure the gauge is exported + jemalloc_resident_gauge.record(0, &[]); + let cpu_gauge = meter .f64_gauge(cpu_load_metric) .with_description("CPU load for KMS (averaged over all CPUs)") @@ -358,6 +388,10 @@ impl CoreMetrics { total_memory_gauge: 
TaggedMetric::new(total_memory_gauge), process_cpu_usage_gauge: TaggedMetric::new(process_cpu_usage_gauge), process_memory_gauge: TaggedMetric::new(process_memory_gauge), + #[cfg(feature = "heap-profiling")] + jemalloc_allocated_gauge: TaggedMetric::new(jemalloc_allocated_gauge), + #[cfg(feature = "heap-profiling")] + jemalloc_resident_gauge: TaggedMetric::new(jemalloc_resident_gauge), trace_guard: Arc::new(Mutex::new(None)), } } @@ -598,6 +632,22 @@ impl CoreMetrics { .metric .record(usage, &self.process_memory_gauge.with_tags(&[])); } + + /// Record jemalloc's active allocation size (stats.allocated) + #[cfg(feature = "heap-profiling")] + pub fn record_jemalloc_allocated(&self, usage: u64) { + self.jemalloc_allocated_gauge + .metric + .record(usage, &self.jemalloc_allocated_gauge.with_tags(&[])); + } + + /// Record jemalloc's resident memory size (stats.resident) + #[cfg(feature = "heap-profiling")] + pub fn record_jemalloc_resident(&self, usage: u64) { + self.jemalloc_resident_gauge + .metric + .record(usage, &self.jemalloc_resident_gauge.with_tags(&[])); + } } /// Builder for DurationGuard to ensure proper initialization diff --git a/observability/src/sys_metrics.rs b/observability/src/sys_metrics.rs index fffcde51d..41f42a5b5 100644 --- a/observability/src/sys_metrics.rs +++ b/observability/src/sys_metrics.rs @@ -3,6 +3,8 @@ use std::{cmp::max, ffi::OsStr, fs, time::Duration}; use sysinfo::{ ProcessRefreshKind, ProcessesToUpdate, RefreshKind, System, MINIMUM_CPU_UPDATE_INTERVAL, }; +#[cfg(feature = "heap-profiling")] +use tikv_jemalloc_ctl::stats as jemalloc_stats; pub fn start_sys_metrics_collection(refresh_interval: Duration) -> anyhow::Result<()> { // Only fail for info we'll actually poll later on @@ -80,6 +82,28 @@ pub fn start_sys_metrics_collection(refresh_interval: Duration) -> anyhow::Resul let socat_count = get_socat_file_descriptor_count(&system); METRICS.record_socat_file_descriptors(socat_count); + // Jemalloc allocator stats + #[cfg(feature = 
"heap-profiling")] + { + // Advance jemalloc's stats epoch to get fresh values + match tikv_jemalloc_ctl::epoch::mib() { + Ok(epoch) => { + if let Err(e) = epoch.advance() { + tracing::debug!("jemalloc epoch advance failed: {e}"); + } + } + Err(e) => { + tracing::debug!("jemalloc epoch mib lookup failed: {e}"); + } + } + if let Ok(allocated) = jemalloc_stats::allocated::read() { + METRICS.record_jemalloc_allocated(allocated as u64); + } + if let Ok(resident) = jemalloc_stats::resident::read() { + METRICS.record_jemalloc_resident(resident as u64); + } + } + // Ensure we sleep at least the time needed to accurately update CPU usage, as recommended by sysinfo documentation tokio::time::sleep(max(refresh_interval, MINIMUM_CPU_UPDATE_INTERVAL)).await; } diff --git a/profiling/.gitignore b/profiling/.gitignore new file mode 100644 index 000000000..9fd4d8cc4 --- /dev/null +++ b/profiling/.gitignore @@ -0,0 +1,2 @@ +heap-dumps/ +heap-analysis/ diff --git a/profiling/README.md b/profiling/README.md new file mode 100644 index 000000000..b27156250 --- /dev/null +++ b/profiling/README.md @@ -0,0 +1,138 @@ +# Heap Profiling + +Detect memory leaks in KMS core nodes using jemalloc heap profiling. + +## How it works + +The `heap-profiling` Cargo profile builds with jemalloc and preserves debug symbols. At runtime, jemalloc samples allocations and dumps heap snapshots on memory peaks and on `SIGUSR1`. Comparing two snapshots shows only allocations that *grew* — these are your leak *candidates*. Some growth may be legitimate (caches warming up, connection pools expanding, etc.), so the analyst must judge which sites represent actual leaks. + +> **Note:** `heap-profiling` is used as both a Cargo **feature** (enables jemalloc code paths and SIGUSR1 handler) and a Cargo **profile** (preserves debug info for jeprof). The docker-compose file enables both automatically. 
+ +## Host dependencies + +- **jeprof** (from gperftools) — reads jemalloc `.heap` dumps +- **graphviz** — renders SVG flamegraphs (`dot`) +- **addr2line** (from binutils) — resolves addresses to source lines + +Install on Ubuntu/Debian: +```bash +sudo apt install google-perftools graphviz binutils +``` + +Install on macOS: +```bash +brew install gperftools graphviz binutils +``` + +## Usage + +All commands run from the repo root. + +### 1. Build + +```bash +make build-compose-heap-profiling +``` + +### 2. Start + +```bash +make start-compose-heap-profiling +``` + +Verify `[heap-profiling] Ready` appears in container logs. + +### 3. Dump heap profiles + +```bash +# Take a baseline dump before load +make dump-heap-profiles + +# ... run your workload ... + +# Take another dump after load +make dump-heap-profiles +``` + +This sends `SIGUSR1` to each core, copies `.heap` files, the binary, and `/proc/PID/maps` (for PIE address resolution) to `profiling/heap-dumps/`. + +### 4. Analyze + +```bash +./profiling/analyze-heap.sh ./profiling/heap-dumps/kms-server ./profiling/heap-dumps/core-1/ +``` + +Output in `profiling/heap-analysis/`: + +| File | Description | +|---|---| +| `top-leaks.txt` | Top allocation sites in the latest snapshot | +| `latest.svg` | Flamegraph of the latest snapshot | +| `diff-leaks.txt` | Allocation sites that grew between first and last snapshot | +| `diff.svg` | Diff flamegraph — **the most useful output** | + +Open the `.svg` files in a browser. The diff shows only allocations that increased between the two dumps. + +## Profiling + +### `lg_prof_sample` trade-offs + +The `lg_prof_sample` setting in `MALLOC_CONF` controls profiling granularity: + +| Value | Sample interval | Overhead | Use case | +|---|---|---|---| +| `19` | 512 KB | ~1-2% | Quick smoke-test, production-safe | +| `12` | 4 KB | ~15-20% | Detailed leak hunting (default in this stack) | + +Lower values capture more allocations but slow things down. 
The default is `12` for thorough profiling; bump to `19` if you only need a quick pass. + +### Diagnosing leak type with Prometheus metrics + +After deploying with the telemetry stack (included by default), compare these three metrics in Prometheus/Grafana: + +| `kms_jemalloc_allocated` | `kms_jemalloc_resident` | `kms_process_memory_usage` (RSS) | Diagnosis | +|---|---|---|---| +| Staircases up | Staircases up | Staircases up | **Application-level leak** — objects allocated and never freed | +| Flat | Staircases up | Staircases up | **Allocator fragmentation** — freed memory can't be returned due to mixed page usage | +| Flat | Flat | Staircases up | **Non-jemalloc memory growth** — mmap, thread stacks, shared libs, etc. | + +- `kms_jemalloc_allocated` — bytes the app actively holds via jemalloc +- `kms_jemalloc_resident` — bytes jemalloc has mapped from the OS (includes fragmentation) +- `kms_process_memory_usage` — total process RSS (includes non-jemalloc memory) + +## Files + +``` +profiling/ +├── README.md +├── analyze-heap.sh # Analysis script (handles PIE/ASLR address resolution) +├── docker-compose-heap-profiling.yml # Compose override (build args + MALLOC_CONF) +├── heap-dumps/ # Dumped .heap files + binary + maps.txt (git-ignored) +└── heap-analysis/ # Analysis output (git-ignored) +``` + +## Troubleshooting + +### Symbols show as `??:0` + +jeprof needs three things to resolve addresses: + +1. **Debug info in the binary** — the `heap-profiling` Cargo profile sets `debug=1` (line tables) +2. **`addr2line` on the host** — `which addr2line` (from binutils) +3. **`MAPPED_LIBRARIES:` section in the heap dump** — jemalloc writes this from `/proc/self/maps`. If missing, `make dump-heap-profiles` captures it separately as `maps.txt`, and `analyze-heap.sh` injects it automatically + +If symbols still don't resolve, check: +```bash +# Binary has debug sections? +readelf -S ./profiling/heap-dumps/kms-server | grep debug + +# Heap dump has MAPPED_LIBRARIES? 
+grep -c MAPPED_LIBRARIES ./profiling/heap-dumps/core-1/*.heap + +# maps.txt was captured? +ls -l ./profiling/heap-dumps/core-1/maps.txt +``` + +### Negative diff totals + +This happens when auto-dumps from `prof_gdump:true` (taken at memory peaks) get mixed with manual dumps. The script prefers manual dumps (from SIGUSR1) for diffing. For reliable diffs, always take two manual dumps: one before and one after your workload. diff --git a/profiling/analyze-heap.sh b/profiling/analyze-heap.sh new file mode 100755 index 000000000..b4df21aa3 --- /dev/null +++ b/profiling/analyze-heap.sh @@ -0,0 +1,261 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Analyze jemalloc heap profile dumps from KMS core nodes. +# +# Works on both Linux and macOS. +# +# Prerequisites: +# - Install jeprof: +# Ubuntu/Debian: apt install jemalloc +# macOS: brew install jemalloc +# - graphviz for SVG output: +# Ubuntu/Debian: apt install graphviz +# macOS: brew install graphviz +# - addr2line (from binutils) for symbol resolution +# +# Usage: +# # 1. Start the profiling stack (from repo root) +# make build-compose-heap-profiling +# make start-compose-heap-profiling +# +# # 2. Run your decryption workload, then dump + copy profiles +# make dump-heap-profiles +# +# # 3. Analyze +# ./profiling/analyze-heap.sh ./profiling/heap-dumps/kms-server ./profiling/heap-dumps/core-1/ +# +# Output (inside profiling/heap-analysis/): +# top-leaks.txt — text listing of largest allocation sites +# latest.svg — flamegraph of the latest heap snapshot +# diff-leaks.txt — allocation sites that GREW between first and last dump +# diff.svg — diff flamegraph (the most useful: shows your leaks) + +BINARY="${1:?Usage: $0 }" +DUMP_DIR="${2:?Usage: $0 }" + +# ── Find jeprof ────────────────────────────────────────────────────────── +JEPROF="" +for cmd in jeprof; do + if command -v "$cmd" &>/dev/null; then + JEPROF="$cmd" + break + fi +done + +if [ -z "$JEPROF" ]; then + echo "ERROR: jeprof not found. 
Install with:" + echo " Ubuntu/Debian: apt install jemalloc" + echo " macOS: brew install jemalloc" + echo " Or build from: https://github.com/jemalloc/jemalloc" + exit 1 +fi + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +OUT_DIR="$SCRIPT_DIR/heap-analysis" +mkdir -p "$OUT_DIR" + +# ── Cross-platform helpers ──────────────────────────────────────────────── +# realpath: available on Linux, but not always on macOS +portable_realpath() { + if command -v realpath &>/dev/null; then + realpath "$1" + else + # Python fallback (available on macOS by default) + python3 -c "import os,sys; print(os.path.realpath(sys.argv[1]))" "$1" + fi +} + +# sed -i: GNU sed uses -i (no suffix arg), BSD (macOS) sed requires -i '' (explicit empty suffix) +portable_sed_i() { + local expr="$1" + local file="$2" + if sed --version 2>/dev/null | grep -q GNU; then + sed -i "$expr" "$file" + else + sed -i '' "$expr" "$file" + fi +} + +ABS_BINARY="$(portable_realpath "$BINARY")" +BINARY_NAME="$(basename "$BINARY")" + +# Extract the binary load base address from a heap dump's MAPPED_LIBRARIES +get_binary_base() { + grep -A9999 '^MAPPED_LIBRARIES:' "$1" 2>/dev/null \ + | grep -F "$ABS_BINARY" | grep -E "^[0-9a-f].*r.xp " \ + | head -1 | cut -d'-' -f1 +} + +# ── Working directory for processed heap dumps ─────────────────────────── +# We create copies with the binary path in MAPPED_LIBRARIES rewritten to +# match the local binary. This is the critical fix for PIE/ASLR: jeprof +# uses the path in MAPPED_LIBRARIES to match the binary and compute the +# load offset — if the container path (/app/kms/.../kms-server) doesn't +# match the local path, jeprof can't translate addresses → ??:0. 
+WORK_DIR=$(mktemp -d) +trap 'rm -rf "$WORK_DIR"' EXIT + +# ── Find maps.txt (captured from /proc/PID/maps by make dump-heap-profiles) +MAPS_FILE="" +for candidate in "$DUMP_DIR/maps.txt" "$(dirname "$DUMP_DIR")/maps.txt"; do + if [ -f "$candidate" ]; then + MAPS_FILE="$candidate" + break + fi +done + +# ── Prepare a heap dump for jeprof ─────────────────────────────────────── +# 1. Inject MAPPED_LIBRARIES if jemalloc didn't include it +# 2. Rewrite the binary path so jeprof can match it to the local binary +DUMP_COUNTER=0 +prepare_dump() { + local src="$1" + local idx="$2" + local dst="$WORK_DIR/${idx}.heap" + cp "$src" "$dst" + + # Inject MAPPED_LIBRARIES from maps.txt if the dump doesn't have it + if [ -n "$MAPS_FILE" ] && ! grep -q '^MAPPED_LIBRARIES:' "$dst" 2>/dev/null; then + echo " Injecting MAPPED_LIBRARIES into $(basename "$src")" + printf '\nMAPPED_LIBRARIES:\n' >> "$dst" + cat "$MAPS_FILE" >> "$dst" + fi + + # Rewrite the binary path in MAPPED_LIBRARIES so jeprof can match it. + # Container path: /app/kms/core/service/bin/kms-server + # Local path: /home/user/.../profiling/heap-dumps/kms-server + if grep -q 'MAPPED_LIBRARIES:' "$dst" 2>/dev/null; then + portable_sed_i "s|[^ ]*/${BINARY_NAME}\$|${ABS_BINARY}|" "$dst" + fi + + echo "$dst" +} + +# ── Find heap dumps, sorted by modification time (oldest first) ────────── +# Prepare working copies and partition manual vs auto in a single pass. +# Auto-dumps (in /auto/ subdir from prof_gdump) fire at memory peaks and can +# invert diff direction, so we prefer manual dumps (from SIGUSR1) for diffs. 
+ALL_ORIG=() # all original paths +WORK_ALL=() # all working copies +MANUAL_ORIGINALS=() # manual-only originals +MANUAL_WORK=() # manual-only working copies +while IFS= read -r heap; do + work="$(prepare_dump "$heap" "$DUMP_COUNTER")" + DUMP_COUNTER=$((DUMP_COUNTER + 1)) + ALL_ORIG+=("$heap") + WORK_ALL+=("$work") + if [[ "$heap" != *"/auto/"* ]]; then + MANUAL_ORIGINALS+=("$heap") + MANUAL_WORK+=("$work") + fi +done < <(find "$DUMP_DIR" -name '*.heap' -type f -exec ls -1tr {} +) + +if [ ${#ALL_ORIG[@]} -eq 0 ]; then + echo "ERROR: No .heap files found in $DUMP_DIR" + echo "Did you run 'make dump-heap-profiles'?" + exit 1 +fi + +echo "Found ${#ALL_ORIG[@]} heap dump(s)" + +if [ ${#MANUAL_WORK[@]} -ge 2 ]; then + DUMPS_ORIG=("${MANUAL_ORIGINALS[@]}") + DUMPS_WORK=("${MANUAL_WORK[@]}") + echo "Using ${#MANUAL_WORK[@]} manual dump(s) for diff analysis" +else + DUMPS_ORIG=("${ALL_ORIG[@]}") + DUMPS_WORK=("${WORK_ALL[@]}") +fi + +LATEST_ORIG="${DUMPS_ORIG[@]: -1}" +LATEST_WORK="${DUMPS_WORK[@]: -1}" + +# ── 1. Top allocation sites in the latest dump ─────────────────────────── +echo "" +echo "=== Top allocation sites (latest dump: $(basename "$LATEST_ORIG")) ===" +("$JEPROF" --text --lines "$BINARY" "$LATEST_WORK" || true) | head -40 | tee "$OUT_DIR/top-leaks.txt" +echo "" + +# ── Fallback: manual addr2line if jeprof shows ??:0 ────────────────────── +if grep -q '??:0' "$OUT_DIR/top-leaks.txt" 2>/dev/null; then + echo "WARNING: jeprof could not resolve symbols (??:0)." 
+
+  # Try to recover the binary's load (base) address from the dump's
+  # MAPPED_LIBRARIES section — needed to turn runtime addresses into
+  # file-relative offsets below.
+  BASE_ADDR=$(get_binary_base "$LATEST_WORK")
+
+  if [ -n "$BASE_ADDR" ] && command -v addr2line &>/dev/null; then
+    echo " Falling back to manual addr2line (binary base: 0x${BASE_ADDR})"
+    echo ""
+    echo "=== Manual symbol resolution ==="
+    # Re-scan jeprof's text output for raw hex addresses and resolve each
+    # one ourselves (sort -u just deduplicates; order is irrelevant here).
+    grep -oE '0x[0-9a-f]+' "$OUT_DIR/top-leaks.txt" | sort -u | while read -r addr; do
+      # Offset into the binary: virtual_addr - load_base (the form addr2line
+      # expects for position-independent executables — presumably the case
+      # here; verify if symbols still come back as ??).
+      offset=$(printf "0x%x" $(( addr - 0x${BASE_ADDR} )) 2>/dev/null) || continue
+      resolved=$(addr2line -C -f -e "$BINARY" "$offset" 2>/dev/null | head -2 | tr '\n' ' ')
+      if [ -n "$resolved" ] && [[ "$resolved" != *"??"* ]]; then
+        printf " %-20s → %s\n" "$addr" "$resolved"
+      fi
+    done | tee "$OUT_DIR/resolved-symbols.txt"
+    echo ""
+  else
+    echo " Checklist:"
+    echo " 1. Binary has debug info? readelf -S '$BINARY' | grep debug"
+    echo " 2. addr2line installed? which addr2line"
+    if [ -z "$MAPS_FILE" ]; then
+      echo " 3. No maps.txt found — re-run 'make dump-heap-profiles' to capture /proc/PID/maps"
+    fi
+    echo ""
+  fi
+fi
+
+# ── 2. SVG flamegraph of the latest dump ─────────────────────────────────
+echo "Generating $OUT_DIR/latest.svg ..."
+"$JEPROF" --svg --lines "$BINARY" "$LATEST_WORK" > "$OUT_DIR/latest.svg"
+echo " Open $OUT_DIR/latest.svg in a browser to see the full allocation flamegraph."
+echo ""
+
+# ── 3. Diff between earliest and latest dump ─────────────────────────────
+if [ ${#DUMPS_WORK[@]} -ge 2 ]; then
+  EARLIEST_ORIG="${DUMPS_ORIG[0]}"
+  EARLIEST_WORK="${DUMPS_WORK[0]}"
+
+  # Detect cross-run diffs: if the ASLR base addresses differ, the dumps
+  # are from different process instances and the diff is meaningless.
+  BASE_EARLIEST=$(get_binary_base "$EARLIEST_WORK") # hex load address parsed from MAPPED_LIBRARIES
+  BASE_LATEST=$(get_binary_base "$LATEST_WORK")
+
+  if [ -n "$BASE_EARLIEST" ] && [ -n "$BASE_LATEST" ] && [ "$BASE_EARLIEST" != "$BASE_LATEST" ]; then
+    echo "WARNING: Dumps are from DIFFERENT process instances (ASLR bases differ)."
+    echo " earliest: 0x${BASE_EARLIEST} ($(basename "$EARLIEST_ORIG"))"
+    echo " latest: 0x${BASE_LATEST} ($(basename "$LATEST_ORIG"))"
+    echo " The diff below will be meaningless. Clean up and take fresh dumps:"
+    echo " rm -rf ./profiling/heap-dumps/core-*/"
+    echo " make dump-heap-profiles # first dump"
+    echo " # ... run workload ..."
+    echo " make dump-heap-profiles # second dump"
+    echo ""
+  fi
+
+  echo "=== Diff: $(basename "$EARLIEST_ORIG") → $(basename "$LATEST_ORIG") ==="
+  echo " (Shows allocations that GREW — i.e., your leaks)"
+  echo ""
+  ("$JEPROF" --text --lines --base="$EARLIEST_WORK" "$BINARY" "$LATEST_WORK" || true) | head -40 | tee "$OUT_DIR/diff-leaks.txt"
+  echo ""
+
+  echo "Generating $OUT_DIR/diff.svg ..."
+  "$JEPROF" --svg --lines --base="$EARLIEST_WORK" "$BINARY" "$LATEST_WORK" > "$OUT_DIR/diff.svg"
+  echo " Open $OUT_DIR/diff.svg — this is the MOST USEFUL output."
+  echo " It shows only the allocations that grew between the two dumps."
+  echo ""
+else
+  echo "Only 1 dump found. For diff analysis, take at least 2 manual dumps:"
+  echo " make dump-heap-profiles # before load"
+  echo " # ... run your workload ..."
+  echo " make dump-heap-profiles # after load"
+  echo ""
+fi
+
+echo "=== Analysis complete ==="
+echo "Files in $OUT_DIR/:"
+ls -lh "$OUT_DIR/"
diff --git a/profiling/docker-compose-heap-profiling.yml b/profiling/docker-compose-heap-profiling.yml
new file mode 100644
index 000000000..9dfc9e90c
--- /dev/null
+++ b/profiling/docker-compose-heap-profiling.yml
@@ -0,0 +1,63 @@
+# Docker Compose override that layers jemalloc heap profiling onto the KMS core nodes.
+#
+# Usage:
+#   # Build with heap profiling enabled
+#   make build-compose-heap-profiling
+#
+#   # Run normally, then at any point dump + copy + analyze:
+#   make dump-heap-profiles
+#   ./profiling/analyze-heap.sh ./profiling/heap-dumps/kms-server ./profiling/heap-dumps/core-1/
+#
+# The key insight: take a dump BEFORE sustained load, and another AFTER.
+# The diff shows only allocations that grew — i.e., your leaks.
+
+x-heap-profiling-build: &heap-profiling-build
+  args:
+    CARGO_EXTRA_FEATURES: heap-profiling
+    LTO_RELEASE: heap-profiling
+    # Frame pointers give jemalloc reliable stack traces for allocation profiling.
+    # Without them, some frames may be missing from heap dump backtraces.
+    RUSTFLAGS: "-C force-frame-pointers=yes"
+
+x-heap-profiling-env: &heap-profiling-env
+  # Enable jemalloc profiling:
+  #   prof:true          — turn the allocation profiler on
+  #   lg_prof_sample:12  — sample on average every 2^12 = 4096 bytes allocated
+  #   prof_gdump:true    — auto-dump whenever total memory hits a new global peak
+  #   prof_final:true    — write a final dump when the process exits
+  - MALLOC_CONF=prof:true,lg_prof_sample:12,prof_gdump:true,prof_final:true,prof_prefix:/tmp/kms-heap/auto/prof
+
+services:
+  dev-kms-core-gen-signing-keys-ca-certs:
+    image: ghcr.io/zama-ai/kms/core-service:latest-dev-heap-profiling
+    build:
+      <<: *heap-profiling-build
+
+  dev-kms-core-1:
+    image: ghcr.io/zama-ai/kms/core-service:latest-dev-heap-profiling
+    build:
+      <<: *heap-profiling-build
+    environment: *heap-profiling-env
+
+  dev-kms-core-2:
+    image: ghcr.io/zama-ai/kms/core-service:latest-dev-heap-profiling
+    build:
+      <<: *heap-profiling-build
+    environment: *heap-profiling-env
+
+  dev-kms-core-3:
+    image: ghcr.io/zama-ai/kms/core-service:latest-dev-heap-profiling
+    build:
+      <<: *heap-profiling-build
+    environment: *heap-profiling-env
+
+  dev-kms-core-4:
+    image: ghcr.io/zama-ai/kms/core-service:latest-dev-heap-profiling
+    build:
+      <<: *heap-profiling-build
+    environment: *heap-profiling-env
+
+  dev-kms-core-init:
+    image: ghcr.io/zama-ai/kms/core-service:latest-dev-heap-profiling
+    build:
+      <<: *heap-profiling-build