Skip to content

Commit d8d9c26

Browse files
authored
feat(kin-db): throughput-profile embedding budget wiring; release 0.2.7
ResourcePlan-derived BatchBudget + graph-chunk sizing, active only under KIN_RESOURCE_PROFILE=throughput; default/proof path byte-identical and explicit KIN_EMBED_* overrides still win. Resolves kin-infer from registry 0.2.2.
1 parent 6121fe4 commit d8d9c26

4 files changed

Lines changed: 159 additions & 12 deletions

File tree

.cargo/config.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,5 @@ index = "sparse+https://kinlab.ai/registry/cargo/"
99

1010
[patch.kin]
1111
kin-blobs = { git = "https://github.com/firelock-ai/kin-blobs.git", rev = "5e8b3c3a596163fe6c4b276bcd27c02ffacd694f" }
12-
kin-infer = { git = "https://github.com/firelock-ai/kin-infer.git", rev = "ff1a60c46aa51487b8a492c499b0841592102447" }
1312
kin-search = { git = "https://github.com/firelock-ai/kin-search.git", rev = "cb0680b82f38c8477a0facd33b4cdefde3fa6dfc" }
1413
kin-vector = { git = "https://github.com/firelock-ai/kin-vector.git", rev = "dd743d741c6832e15179ac60720b5218558e10fb" }

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ members = ["crates/kin-db", ]
66
resolver = "2"
77

88
[workspace.package]
9-
version = "0.2.6"
9+
version = "0.2.7"
1010
edition = "2021"
1111
license = "Apache-2.0"
1212
authors = ["Troy Fortin <troy@firelock.ai>"]

crates/kin-db/src/embed/mod.rs

Lines changed: 151 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1639,6 +1639,63 @@ fn default_max_attention_area(backend: GpuBackend) -> usize {
16391639
}
16401640
}
16411641

/// Reports whether the throughput resource profile has been requested.
///
/// Returns `true` only when the `KIN_RESOURCE_PROFILE` environment variable is
/// readable and, after trimming surrounding whitespace, equals `throughput`
/// (ASCII case-insensitive). An unset or unreadable variable yields `false`.
/// The environment is consulted on every call; nothing is cached here.
#[cfg(feature = "embeddings")]
pub(crate) fn resource_profile_is_throughput() -> bool {
    match std::env::var("KIN_RESOURCE_PROFILE") {
        Ok(raw) => raw.trim().eq_ignore_ascii_case("throughput"),
        Err(_) => false,
    }
}
1650+
/// Returns the process-wide throughput-profile embedding plan for `backend`.
///
/// Each backend owns its own `OnceLock`. The first call for a backend builds
/// the plan from `detect_host()`, `detect_memory()` and an accelerator
/// description that leaves every device-size field unset; later calls for the
/// same backend return that stored value without rebuilding it.
#[cfg(feature = "embeddings")]
fn throughput_embedding_plan(backend: GpuBackend) -> &'static kin_infer::resource::EmbeddingPlan {
    use kin_infer::resource::{
        detect_host, detect_memory, AcceleratorBackend, AcceleratorInfo, Profile, ResourcePlan,
    };
    use std::sync::OnceLock;

    static METAL: OnceLock<kin_infer::resource::EmbeddingPlan> = OnceLock::new();
    static CUDA: OnceLock<kin_infer::resource::EmbeddingPlan> = OnceLock::new();
    static CPU: OnceLock<kin_infer::resource::EmbeddingPlan> = OnceLock::new();

    // One cache slot per backend, so plans for different backends never mix.
    let slot = match backend {
        GpuBackend::Metal => &METAL,
        GpuBackend::Cuda => &CUDA,
        GpuBackend::Cpu => &CPU,
    };

    // Only the Metal description is flagged as unified memory.
    let (kind, unified) = match backend {
        GpuBackend::Metal => (AcceleratorBackend::Metal, true),
        GpuBackend::Cuda => (AcceleratorBackend::Cuda, false),
        GpuBackend::Cpu => (AcceleratorBackend::Cpu, false),
    };

    slot.get_or_init(|| {
        let accelerator = AcceleratorInfo {
            backend: kind,
            device_index: 0,
            unified_memory: unified,
            device_total_bytes: None,
            device_available_bytes: None,
            recommended_working_set_bytes: None,
            max_single_buffer_bytes: None,
            max_inflight_command_buffers: 1,
            reserve_device_bytes: None,
            allow_cpu_fallback: true,
        };
        // Host is probed before memory, matching the argument order below.
        let host = detect_host();
        let memory = detect_memory();
        let plan = ResourcePlan::for_profile(Profile::Throughput, &host, &accelerator, &memory);
        plan.embedding
    })
}
1692+
/// Graph entity-chunk size under the throughput profile.
///
/// Reads `max_entities_per_graph_chunk` from the cached plan for
/// `GpuBackend::Cpu`; the value is treated as backend-independent, so the CPU
/// plan is used regardless of the active embedding backend.
#[cfg(all(feature = "embeddings", feature = "vector"))]
pub(crate) fn throughput_graph_chunk_size() -> usize {
    let cpu_plan = throughput_embedding_plan(GpuBackend::Cpu);
    cpu_plan.max_entities_per_graph_chunk
}
1698+
16421699
/// The two budgets that bound a single embed GPU dispatch, and the rule that packs
16431700
/// a length-sorted run of entities into one.
16441701
///
@@ -1670,15 +1727,25 @@ impl BatchBudget {
16701727
.filter(|value| *value > 0)
16711728
.unwrap_or(fallback)
16721729
};
1673-
Self {
1674-
max_tokens: env_usize(
1675-
"KIN_EMBED_MAX_BATCH_TOKENS",
1730+
// Fallbacks default to today's hardcoded budgets; under the throughput
1731+
// profile they become the throughput plan's budgets. An explicit
1732+
// KIN_EMBED_* override still wins over both.
1733+
let (default_tokens, default_area) = if resource_profile_is_throughput() {
1734+
let plan = throughput_embedding_plan(backend);
1735+
let area = match plan.max_attention_area {
1736+
Some(value) => value as usize,
1737+
None => usize::MAX,
1738+
};
1739+
(plan.max_batch_tokens, area)
1740+
} else {
1741+
(
16761742
default_max_batch_tokens(backend),
1677-
),
1678-
max_attention_area: env_usize(
1679-
"KIN_EMBED_MAX_ATTENTION_AREA",
16801743
default_max_attention_area(backend),
1681-
),
1744+
)
1745+
};
1746+
Self {
1747+
max_tokens: env_usize("KIN_EMBED_MAX_BATCH_TOKENS", default_tokens),
1748+
max_attention_area: env_usize("KIN_EMBED_MAX_ATTENTION_AREA", default_area),
16821749
}
16831750
}
16841751

@@ -2619,6 +2686,83 @@ mod tests {
26192686
assert_eq!(longest, 2048);
26202687
}
26212688

/// Process-wide lock held by every test that touches the resource-profile
/// environment variables, so those tests never interleave.
#[cfg(feature = "embeddings")]
static RESOURCE_ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());

/// Serializes process-env access for the resource-profile tests and snapshots
/// the relevant vars, restoring them on drop so the suite never leaks state.
///
/// Values are captured as `OsString` (via `std::env::var_os`) so that a
/// variable holding non-UTF-8 bytes is restored verbatim instead of being
/// mistaken for "unset" and removed on drop.
#[cfg(feature = "embeddings")]
struct ResourceEnvGuard {
    /// Held for the guard's whole lifetime; released only after `drop` has
    /// restored the environment, because fields drop after `Drop::drop` runs.
    _lock: std::sync::MutexGuard<'static, ()>,
    /// Prior value of `KIN_RESOURCE_PROFILE`, `None` if it was unset.
    profile: Option<std::ffi::OsString>,
    /// Prior value of `KIN_EMBED_MAX_BATCH_TOKENS`, `None` if it was unset.
    max_tokens: Option<std::ffi::OsString>,
    /// Prior value of `KIN_EMBED_MAX_ATTENTION_AREA`, `None` if it was unset.
    max_area: Option<std::ffi::OsString>,
}

#[cfg(feature = "embeddings")]
impl ResourceEnvGuard {
    /// Takes the env lock, records the three variables, then clears them so the
    /// test starts from a known-empty configuration.
    ///
    /// A poisoned lock is recovered rather than propagated: an earlier test
    /// panicking must not cascade into every later env test.
    fn acquire() -> Self {
        let lock = RESOURCE_ENV_LOCK
            .lock()
            .unwrap_or_else(|poison| poison.into_inner());
        // Snapshot-then-clear one key at a time; runs only while `lock` is held.
        let take = |key: &str| {
            let previous = std::env::var_os(key);
            std::env::remove_var(key);
            previous
        };
        Self {
            profile: take("KIN_RESOURCE_PROFILE"),
            max_tokens: take("KIN_EMBED_MAX_BATCH_TOKENS"),
            max_area: take("KIN_EMBED_MAX_ATTENTION_AREA"),
            _lock: lock,
        }
    }
}

#[cfg(feature = "embeddings")]
impl Drop for ResourceEnvGuard {
    /// Puts each variable back exactly as it was when the guard was acquired:
    /// previously-set values are re-set, previously-unset ones are removed.
    fn drop(&mut self) {
        let restore = |key: &str, prev: &Option<std::ffi::OsString>| match prev {
            Some(value) => std::env::set_var(key, value),
            None => std::env::remove_var(key),
        };
        restore("KIN_RESOURCE_PROFILE", &self.profile);
        restore("KIN_EMBED_MAX_BATCH_TOKENS", &self.max_tokens);
        restore("KIN_EMBED_MAX_ATTENTION_AREA", &self.max_area);
    }
}
2733+
/// With no resource profile and no `KIN_EMBED_*` overrides in the environment,
/// the Metal budget must equal both the named Metal constants and the literal
/// values pinned here.
#[cfg(feature = "embeddings")]
#[test]
fn batch_budget_unset_profile_matches_today() {
    // Acquiring the guard clears the profile and both override variables.
    let _env = ResourceEnvGuard::acquire();
    let metal = BatchBudget::from_env(GpuBackend::Metal);
    let observed = (metal.max_tokens, metal.max_attention_area);
    assert_eq!(
        observed,
        (METAL_MAX_BATCH_TOKENS, METAL_MAX_ATTENTION_AREA)
    );
    // Literal pins catch an accidental edit of the constants themselves.
    assert_eq!(observed, (16_384, 8_388_608));
}
2744+
/// Under `KIN_RESOURCE_PROFILE=throughput` with no explicit overrides, the
/// Metal budget is expected to report 65,536 tokens and an unchanged
/// 8,388,608 attention area.
#[cfg(feature = "embeddings")]
#[test]
fn batch_budget_throughput_lifts_metal_tokens_keeps_area() {
    let _env = ResourceEnvGuard::acquire();
    std::env::set_var("KIN_RESOURCE_PROFILE", "throughput");
    let metal = BatchBudget::from_env(GpuBackend::Metal);
    assert_eq!(
        (metal.max_tokens, metal.max_attention_area),
        (65_536, 8_388_608)
    );
}
2754+
/// An explicit `KIN_EMBED_MAX_BATCH_TOKENS` value takes precedence over the
/// throughput profile's token budget; the attention area, which has no
/// override set here, stays at 8,388,608.
#[cfg(feature = "embeddings")]
#[test]
fn batch_budget_env_override_wins_over_throughput() {
    let _env = ResourceEnvGuard::acquire();
    std::env::set_var("KIN_RESOURCE_PROFILE", "throughput");
    std::env::set_var("KIN_EMBED_MAX_BATCH_TOKENS", "12345");
    let metal = BatchBudget::from_env(GpuBackend::Metal);
    assert_eq!(
        (metal.max_tokens, metal.max_attention_area),
        (12_345, 8_388_608)
    );
}
2765+
26222766
#[cfg(feature = "embeddings")]
26232767
#[test]
26242768
fn resolve_embed_backend_honors_env_and_metal_default() {

crates/kin-db/src/engine/graph.rs

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,13 @@ fn default_embedding_batch_size() -> usize {
3838
.and_then(|value| value.parse::<usize>().ok())
3939
.filter(|value| *value > 0)
4040
.unwrap_or_else(|| {
41-
std::thread::available_parallelism()
42-
.map(|threads| (threads.get() * 16).clamp(64, 192))
43-
.unwrap_or(128)
41+
if crate::embed::resource_profile_is_throughput() {
42+
crate::embed::throughput_graph_chunk_size()
43+
} else {
44+
std::thread::available_parallelism()
45+
.map(|threads| (threads.get() * 16).clamp(64, 192))
46+
.unwrap_or(128)
47+
}
4448
})
4549
}
4650

0 commit comments

Comments
 (0)