@@ -1639,6 +1639,63 @@ fn default_max_attention_area(backend: GpuBackend) -> usize {
16391639 }
16401640}
16411641
/// Reports whether the throughput resource profile has been requested.
///
/// Returns `true` only when the `KIN_RESOURCE_PROFILE` environment variable is
/// set and its value, once surrounding whitespace is trimmed, equals
/// `throughput` ignoring ASCII case. Any lookup failure yields `false`.
///
/// The environment is consulted on every call; nothing is cached here.
#[cfg(feature = "embeddings")]
pub(crate) fn resource_profile_is_throughput() -> bool {
    match std::env::var("KIN_RESOURCE_PROFILE") {
        Ok(raw) => raw.trim().eq_ignore_ascii_case("throughput"),
        // Unset (or otherwise unreadable) values never select the profile.
        Err(_) => false,
    }
}
1650+
/// Returns the throughput-profile embedding plan for `backend`.
///
/// One plan is kept per backend variant in a function-local `OnceLock`, so the
/// plan is computed on the first request for that backend and the same
/// `'static` reference is handed back on every later call.
///
/// The plan is the `embedding` part of `ResourcePlan::for_profile` evaluated
/// with `Profile::Throughput`, the detected host and memory, and a synthetic
/// accelerator description: device index 0, every byte limit left as `None`,
/// a single in-flight command buffer, and CPU fallback allowed. Only Metal is
/// described as having unified memory.
#[cfg(feature = "embeddings")]
fn throughput_embedding_plan(backend: GpuBackend) -> &'static kin_infer::resource::EmbeddingPlan {
    use kin_infer::resource::{
        detect_host, detect_memory, AcceleratorBackend, AcceleratorInfo, EmbeddingPlan, Profile,
        ResourcePlan,
    };
    use std::sync::OnceLock;

    static METAL: OnceLock<EmbeddingPlan> = OnceLock::new();
    static CUDA: OnceLock<EmbeddingPlan> = OnceLock::new();
    static CPU: OnceLock<EmbeddingPlan> = OnceLock::new();

    // Select the cache slot together with the accelerator traits for this backend.
    let (slot, kind, shares_host_memory) = match backend {
        GpuBackend::Metal => (&METAL, AcceleratorBackend::Metal, true),
        GpuBackend::Cuda => (&CUDA, AcceleratorBackend::Cuda, false),
        GpuBackend::Cpu => (&CPU, AcceleratorBackend::Cpu, false),
    };

    slot.get_or_init(|| {
        let accelerator = AcceleratorInfo {
            backend: kind,
            device_index: 0,
            unified_memory: shares_host_memory,
            device_total_bytes: None,
            device_available_bytes: None,
            recommended_working_set_bytes: None,
            max_single_buffer_bytes: None,
            max_inflight_command_buffers: 1,
            reserve_device_bytes: None,
            allow_cpu_fallback: true,
        };
        // Host is probed before memory, matching the original argument order.
        let host = detect_host();
        let memory = detect_memory();
        ResourcePlan::for_profile(Profile::Throughput, &host, &accelerator, &memory).embedding
    })
}
1692+
/// Graph entity-chunk size under the throughput profile.
///
/// Reads `max_entities_per_graph_chunk` from the throughput embedding plan
/// requested for the CPU backend.
// NOTE(review): the value is treated as backend-independent, which is why the
// CPU plan is used as the representative — confirm against the plan builder.
#[cfg(all(feature = "embeddings", feature = "vector"))]
pub(crate) fn throughput_graph_chunk_size() -> usize {
    let cpu_plan = throughput_embedding_plan(GpuBackend::Cpu);
    cpu_plan.max_entities_per_graph_chunk
}
1698+
16421699/// The two budgets that bound a single embed GPU dispatch, and the rule that packs
16431700/// a length-sorted run of entities into one.
16441701///
@@ -1670,15 +1727,25 @@ impl BatchBudget {
16701727 . filter ( |value| * value > 0 )
16711728 . unwrap_or ( fallback)
16721729 } ;
1673- Self {
1674- max_tokens : env_usize (
1675- "KIN_EMBED_MAX_BATCH_TOKENS" ,
1730+ // Fallbacks default to today's hardcoded budgets; under the throughput
1731+ // profile they become the throughput plan's budgets. An explicit
1732+ // KIN_EMBED_* override still wins over both.
1733+ let ( default_tokens, default_area) = if resource_profile_is_throughput ( ) {
1734+ let plan = throughput_embedding_plan ( backend) ;
1735+ let area = match plan. max_attention_area {
1736+ Some ( value) => value as usize ,
1737+ None => usize:: MAX ,
1738+ } ;
1739+ ( plan. max_batch_tokens , area)
1740+ } else {
1741+ (
16761742 default_max_batch_tokens ( backend) ,
1677- ) ,
1678- max_attention_area : env_usize (
1679- "KIN_EMBED_MAX_ATTENTION_AREA" ,
16801743 default_max_attention_area ( backend) ,
1681- ) ,
1744+ )
1745+ } ;
1746+ Self {
1747+ max_tokens : env_usize ( "KIN_EMBED_MAX_BATCH_TOKENS" , default_tokens) ,
1748+ max_attention_area : env_usize ( "KIN_EMBED_MAX_ATTENTION_AREA" , default_area) ,
16821749 }
16831750 }
16841751
@@ -2619,6 +2686,83 @@ mod tests {
26192686 assert_eq ! ( longest, 2048 ) ;
26202687 }
26212688
/// Process-wide lock serializing the tests that mutate the resource-profile
/// environment variables.
#[cfg(feature = "embeddings")]
static RESOURCE_ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());

/// Serializes process-env access for the resource-profile tests and snapshots
/// the relevant vars, restoring them on drop so the suite never leaks state.
///
/// Values are captured as `OsString` (via `std::env::var_os`) so that a
/// pre-existing non-Unicode value is restored verbatim instead of being
/// mistaken for "unset" and deleted when the guard drops.
#[cfg(feature = "embeddings")]
struct ResourceEnvGuard {
    /// Held for the guard's whole lifetime; released only after `drop` has
    /// finished restoring the environment.
    _lock: std::sync::MutexGuard<'static, ()>,
    /// Prior value of `KIN_RESOURCE_PROFILE`, if it was set.
    profile: Option<std::ffi::OsString>,
    /// Prior value of `KIN_EMBED_MAX_BATCH_TOKENS`, if it was set.
    max_tokens: Option<std::ffi::OsString>,
    /// Prior value of `KIN_EMBED_MAX_ATTENTION_AREA`, if it was set.
    max_area: Option<std::ffi::OsString>,
}

#[cfg(feature = "embeddings")]
impl ResourceEnvGuard {
    /// Takes the env lock, records the current values of the three variables,
    /// then clears them so the test starts from a clean environment.
    fn acquire() -> Self {
        // A poisoned lock only means another test panicked while holding it;
        // the `()` payload cannot be corrupted, so keep going.
        let lock = RESOURCE_ENV_LOCK
            .lock()
            .unwrap_or_else(|poison| poison.into_inner());
        let guard = Self {
            _lock: lock,
            profile: std::env::var_os("KIN_RESOURCE_PROFILE"),
            max_tokens: std::env::var_os("KIN_EMBED_MAX_BATCH_TOKENS"),
            max_area: std::env::var_os("KIN_EMBED_MAX_ATTENTION_AREA"),
        };
        std::env::remove_var("KIN_RESOURCE_PROFILE");
        std::env::remove_var("KIN_EMBED_MAX_BATCH_TOKENS");
        std::env::remove_var("KIN_EMBED_MAX_ATTENTION_AREA");
        guard
    }
}

#[cfg(feature = "embeddings")]
impl Drop for ResourceEnvGuard {
    /// Puts each variable back to its snapshotted state: re-set when it had a
    /// value, removed when it was absent.
    fn drop(&mut self) {
        let restore = |key: &str, prev: &Option<std::ffi::OsString>| match prev {
            Some(value) => std::env::set_var(key, value),
            None => std::env::remove_var(key),
        };
        restore("KIN_RESOURCE_PROFILE", &self.profile);
        restore("KIN_EMBED_MAX_BATCH_TOKENS", &self.max_tokens);
        restore("KIN_EMBED_MAX_ATTENTION_AREA", &self.max_area);
    }
}
2733+
/// With the resource profile and both `KIN_EMBED_*` overrides cleared by the
/// env guard, the Metal budget resolves to the `METAL_*` constants. The
/// literal values are pinned too, so a change to the constants is noticed.
#[cfg(feature = "embeddings")]
#[test]
fn batch_budget_unset_profile_matches_today() {
    let _env_guard = ResourceEnvGuard::acquire();

    let resolved = BatchBudget::from_env(GpuBackend::Metal);
    let tokens = resolved.max_tokens;
    let area = resolved.max_attention_area;

    assert_eq!(tokens, METAL_MAX_BATCH_TOKENS);
    assert_eq!(tokens, 16_384);
    assert_eq!(area, METAL_MAX_ATTENTION_AREA);
    assert_eq!(area, 8_388_608);
}
2744+
/// Selecting the throughput profile (with no `KIN_EMBED_*` overrides) is
/// expected to give Metal a 65_536-token budget while the attention-area
/// budget stays at 8_388_608.
#[cfg(feature = "embeddings")]
#[test]
fn batch_budget_throughput_lifts_metal_tokens_keeps_area() {
    let _env_guard = ResourceEnvGuard::acquire();
    std::env::set_var("KIN_RESOURCE_PROFILE", "throughput");

    let resolved = BatchBudget::from_env(GpuBackend::Metal);
    let tokens = resolved.max_tokens;
    let area = resolved.max_attention_area;

    assert_eq!(tokens, 65_536);
    assert_eq!(area, 8_388_608);
}
2754+
/// An explicit `KIN_EMBED_MAX_BATCH_TOKENS` value takes precedence over the
/// throughput profile's token budget; the attention-area budget, which is not
/// overridden here, is expected to remain 8_388_608.
#[cfg(feature = "embeddings")]
#[test]
fn batch_budget_env_override_wins_over_throughput() {
    let _env_guard = ResourceEnvGuard::acquire();
    std::env::set_var("KIN_RESOURCE_PROFILE", "throughput");
    std::env::set_var("KIN_EMBED_MAX_BATCH_TOKENS", "12345");

    let resolved = BatchBudget::from_env(GpuBackend::Metal);
    let tokens = resolved.max_tokens;
    let area = resolved.max_attention_area;

    assert_eq!(tokens, 12_345);
    assert_eq!(area, 8_388_608);
}
2765+
26222766 #[ cfg( feature = "embeddings" ) ]
26232767 #[ test]
26242768 fn resolve_embed_backend_honors_env_and_metal_default ( ) {
0 commit comments