feat(kin-db): throughput-profile embedding budget wiring; release 0.2.7

troyjr4103 · web-flow · commit d8d9c260dc76 · 2026-06-19T16:19:48.000-04:00
Adds a ResourcePlan-derived BatchBudget and graph-chunk sizing that are active only when KIN_RESOURCE_PROFILE=throughput; the default/proof path stays byte-identical, and explicit KIN_EMBED_* overrides still win. kin-infer now resolves from the registry (0.2.2) because its git [patch.kin] override is removed.
diff --git a/.cargo/config.toml b/.cargo/config.toml
@@ -9,6 +9,5 @@ index = "sparse+https://kinlab.ai/registry/cargo/"
 
 [patch.kin]
 kin-blobs = { git = "https://github.com/firelock-ai/kin-blobs.git", rev = "5e8b3c3a596163fe6c4b276bcd27c02ffacd694f" }
-kin-infer = { git = "https://github.com/firelock-ai/kin-infer.git", rev = "ff1a60c46aa51487b8a492c499b0841592102447" }
 kin-search = { git = "https://github.com/firelock-ai/kin-search.git", rev = "cb0680b82f38c8477a0facd33b4cdefde3fa6dfc" }
 kin-vector = { git = "https://github.com/firelock-ai/kin-vector.git", rev = "dd743d741c6832e15179ac60720b5218558e10fb" }
diff --git a/Cargo.toml b/Cargo.toml
@@ -6,7 +6,7 @@ members = ["crates/kin-db", ]
 resolver = "2"
 
 [workspace.package]
-version = "0.2.6"
+version = "0.2.7"
 edition = "2021"
 license = "Apache-2.0"
 authors = ["Troy Fortin <troy@firelock.ai>"]
diff --git a/crates/kin-db/src/embed/mod.rs b/crates/kin-db/src/embed/mod.rs
@@ -1639,6 +1639,63 @@ fn default_max_attention_area(backend: GpuBackend) -> usize {
     }
 }
 
+/// `true` only when `KIN_RESOURCE_PROFILE`, trimmed, is `throughput` (ASCII case-insensitive).
+/// Re-read from the environment on every call (never cached); unset or non-Unicode => `false`.
+#[cfg(feature = "embeddings")]
+pub(crate) fn resource_profile_is_throughput() -> bool {
+    std::env::var("KIN_RESOURCE_PROFILE")
+        .map(|value| value.trim().eq_ignore_ascii_case("throughput"))
+        .unwrap_or(false) // `Err` (unset / not valid Unicode) means "not throughput"
+}
+
+/// Returns the `Profile::Throughput` embedding plan for `backend`, built on first request and
+/// then served from a per-backend `OnceLock`, so detection runs at most once per backend.
+#[cfg(feature = "embeddings")]
+fn throughput_embedding_plan(backend: GpuBackend) -> &'static kin_infer::resource::EmbeddingPlan {
+    use kin_infer::resource::{
+        detect_host, detect_memory, AcceleratorBackend, AcceleratorInfo, Profile, ResourcePlan,
+    };
+    use std::sync::OnceLock;
+
+    static METAL: OnceLock<kin_infer::resource::EmbeddingPlan> = OnceLock::new(); // one cache cell per backend
+    static CUDA: OnceLock<kin_infer::resource::EmbeddingPlan> = OnceLock::new();
+    static CPU: OnceLock<kin_infer::resource::EmbeddingPlan> = OnceLock::new();
+
+    let (cell, accel_backend, unified_memory) = match backend { // (cache cell, kin-infer backend, unified flag)
+        GpuBackend::Metal => (&METAL, AcceleratorBackend::Metal, true), // only Metal is marked unified-memory
+        GpuBackend::Cuda => (&CUDA, AcceleratorBackend::Cuda, false),
+        GpuBackend::Cpu => (&CPU, AcceleratorBackend::Cpu, false),
+    };
+
+    cell.get_or_init(|| { // closure runs only while the chosen cell is still empty
+        let accel = AcceleratorInfo {
+            backend: accel_backend,
+            device_index: 0, // always device 0 here; no device selection is plumbed through
+            unified_memory,
+            device_total_bytes: None, // device limits are left unset — presumably the planner
+            device_available_bytes: None, // falls back to its own defaults; TODO confirm
+            recommended_working_set_bytes: None,
+            max_single_buffer_bytes: None,
+            max_inflight_command_buffers: 1,
+            reserve_device_bytes: None,
+            allow_cpu_fallback: true,
+        };
+        ResourcePlan::for_profile(
+            Profile::Throughput,
+            &detect_host(),
+            &accel,
+            &detect_memory(),
+        )
+        .embedding // only the `embedding` field of the returned plan is cached
+    })
+}
+
+/// Throughput graph entity-chunk size, taken from the CPU plan (assumed backend-independent).
+#[cfg(all(feature = "embeddings", feature = "vector"))]
+pub(crate) fn throughput_graph_chunk_size() -> usize {
+    throughput_embedding_plan(GpuBackend::Cpu).max_entities_per_graph_chunk // NOTE(review): unchecked for 0 — confirm
+}
+
 /// The two budgets that bound a single embed GPU dispatch, and the rule that packs
 /// a length-sorted run of entities into one.
 ///
@@ -1670,15 +1727,25 @@ impl BatchBudget {
                 .filter(|value| *value > 0)
                 .unwrap_or(fallback)
         };
-        Self {
-            max_tokens: env_usize(
-                "KIN_EMBED_MAX_BATCH_TOKENS",
+        // Fallbacks default to today's hardcoded budgets; under the throughput
+        // profile they become the throughput plan's budgets. An explicit
+        // KIN_EMBED_* override still wins over both.
+        let (default_tokens, default_area) = if resource_profile_is_throughput() {
+            let plan = throughput_embedding_plan(backend);
+            let area = match plan.max_attention_area {
+                Some(value) => value as usize,
+                None => usize::MAX,
+            };
+            (plan.max_batch_tokens, area)
+        } else {
+            (
                 default_max_batch_tokens(backend),
-            ),
-            max_attention_area: env_usize(
-                "KIN_EMBED_MAX_ATTENTION_AREA",
                 default_max_attention_area(backend),
-            ),
+            )
+        };
+        Self {
+            max_tokens: env_usize("KIN_EMBED_MAX_BATCH_TOKENS", default_tokens),
+            max_attention_area: env_usize("KIN_EMBED_MAX_ATTENTION_AREA", default_area),
         }
     }
 
@@ -2619,6 +2686,83 @@ mod tests {
         assert_eq!(longest, 2048);
     }
 
+    #[cfg(feature = "embeddings")]
+    static RESOURCE_ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(()); // taken by ResourceEnvGuard::acquire
+
+    /// Test guard that holds `RESOURCE_ENV_LOCK` and remembers the prior values of the three env
+    /// vars the resource-profile tests touch; its `Drop` impl puts them back so no state leaks.
+    #[cfg(feature = "embeddings")]
+    struct ResourceEnvGuard {
+        _lock: std::sync::MutexGuard<'static, ()>, // kept only to hold the lock for the guard's lifetime
+        profile: Option<String>, // prior KIN_RESOURCE_PROFILE (`None` if it could not be read)
+        max_tokens: Option<String>, // prior KIN_EMBED_MAX_BATCH_TOKENS
+        max_area: Option<String>, // prior KIN_EMBED_MAX_ATTENTION_AREA
+    }
+
+    #[cfg(feature = "embeddings")]
+    impl ResourceEnvGuard {
+        fn acquire() -> Self { // lock, snapshot the three vars, then clear them
+            let lock = RESOURCE_ENV_LOCK
+                .lock()
+                .unwrap_or_else(|poison| poison.into_inner()); // a panicked holder must not block later tests
+            let guard = Self { // snapshot BEFORE the removals below so Drop can restore
+                _lock: lock,
+                profile: std::env::var("KIN_RESOURCE_PROFILE").ok(),
+                max_tokens: std::env::var("KIN_EMBED_MAX_BATCH_TOKENS").ok(),
+                max_area: std::env::var("KIN_EMBED_MAX_ATTENTION_AREA").ok(),
+            };
+            std::env::remove_var("KIN_RESOURCE_PROFILE"); // every test starts with all three unset
+            std::env::remove_var("KIN_EMBED_MAX_BATCH_TOKENS");
+            std::env::remove_var("KIN_EMBED_MAX_ATTENTION_AREA");
+            guard
+        }
+    }
+
+    #[cfg(feature = "embeddings")]
+    impl Drop for ResourceEnvGuard {
+        fn drop(&mut self) { // runs before the `_lock` field is dropped, i.e. still under the lock
+            let restore = |key: &str, prev: &Option<String>| match prev {
+                Some(value) => std::env::set_var(key, value), // had a value before: put it back
+                None => std::env::remove_var(key), // had no snapshot: leave it unset
+            };
+            restore("KIN_RESOURCE_PROFILE", &self.profile);
+            restore("KIN_EMBED_MAX_BATCH_TOKENS", &self.max_tokens);
+            restore("KIN_EMBED_MAX_ATTENTION_AREA", &self.max_area);
+        }
+    }
+
+    #[cfg(feature = "embeddings")]
+    #[test]
+    fn batch_budget_unset_profile_matches_today() {
+        let _env = ResourceEnvGuard::acquire(); // clears the profile and both KIN_EMBED_* overrides
+        let budget = BatchBudget::from_env(GpuBackend::Metal);
+        assert_eq!(budget.max_tokens, METAL_MAX_BATCH_TOKENS);
+        assert_eq!(budget.max_tokens, 16_384); // literal pinned too — presumably to catch a constant change
+        assert_eq!(budget.max_attention_area, METAL_MAX_ATTENTION_AREA);
+        assert_eq!(budget.max_attention_area, 8_388_608); // 8 * 1024 * 1024
+    }
+
+    #[cfg(feature = "embeddings")]
+    #[test]
+    fn batch_budget_throughput_lifts_metal_tokens_keeps_area() {
+        let _env = ResourceEnvGuard::acquire(); // overrides cleared, so the plan's values are used
+        std::env::set_var("KIN_RESOURCE_PROFILE", "throughput");
+        let budget = BatchBudget::from_env(GpuBackend::Metal);
+        assert_eq!(budget.max_tokens, 65_536); // NOTE(review): plan is built from host detection — confirm host-independent
+        assert_eq!(budget.max_attention_area, 8_388_608); // same area as the non-throughput Metal test
+    }
+
+    #[cfg(feature = "embeddings")]
+    #[test]
+    fn batch_budget_env_override_wins_over_throughput() {
+        let _env = ResourceEnvGuard::acquire(); // restores all three vars when the test ends
+        std::env::set_var("KIN_RESOURCE_PROFILE", "throughput");
+        std::env::set_var("KIN_EMBED_MAX_BATCH_TOKENS", "12345"); // explicit override for tokens only
+        let budget = BatchBudget::from_env(GpuBackend::Metal);
+        assert_eq!(budget.max_tokens, 12_345); // the override beats the throughput plan's budget
+        assert_eq!(budget.max_attention_area, 8_388_608); // no area override: still the plan's value
+    }
+
     #[cfg(feature = "embeddings")]
     #[test]
     fn resolve_embed_backend_honors_env_and_metal_default() {
diff --git a/crates/kin-db/src/engine/graph.rs b/crates/kin-db/src/engine/graph.rs
@@ -38,9 +38,13 @@ fn default_embedding_batch_size() -> usize {
         .and_then(|value| value.parse::<usize>().ok())
         .filter(|value| *value > 0)
         .unwrap_or_else(|| {
-            std::thread::available_parallelism()
-                .map(|threads| (threads.get() * 16).clamp(64, 192))
-                .unwrap_or(128)
+            if crate::embed::resource_profile_is_throughput() {
+                crate::embed::throughput_graph_chunk_size()
+            } else {
+                std::thread::available_parallelism()
+                    .map(|threads| (threads.get() * 16).clamp(64, 192))
+                    .unwrap_or(128)
+            }
         })
 }