@@ -675,11 +675,9 @@ def _general_warmup(self,
675675 self .kv_cache_manager_key )
676676 token_num_upper_bound = min (self .max_num_tokens ,
677677 self .batch_size * (self .max_seq_len - 1 ))
678- curr_max_num_tokens = min (
679- kv_cache_manager .get_num_available_tokens (
680- token_num_upper_bound = token_num_upper_bound ,
681- max_num_draft_tokens = self .original_max_draft_len ),
682- token_num_upper_bound )
678+ curr_max_num_tokens = kv_cache_manager .get_num_available_tokens (
679+ token_num_upper_bound = token_num_upper_bound ,
680+ max_num_draft_tokens = self .original_max_draft_len )
683681 max_batch_size = min (
684682 self .batch_size ,
685683 curr_max_num_tokens // (1 + self .runtime_draft_len ))
@@ -730,11 +728,9 @@ def _run_autotuner_warmup(self, resource_manager: ResourceManager):
730728 self .kv_cache_manager_key )
731729 token_num_upper_bound = min (self .max_num_tokens ,
732730 self .batch_size * (self .max_seq_len - 1 ))
733- curr_max_num_tokens = min (
734- kv_cache_manager .get_num_available_tokens (
735- token_num_upper_bound = token_num_upper_bound ,
736- max_num_draft_tokens = self .original_max_draft_len ),
737- token_num_upper_bound )
731+ curr_max_num_tokens = kv_cache_manager .get_num_available_tokens (
732+ token_num_upper_bound = token_num_upper_bound ,
733+ max_num_draft_tokens = self .original_max_draft_len )
738734
739735 cache_path = os .environ .get ("TLLM_AUTOTUNER_CACHE_PATH" , None )
740736 with self .no_cuda_graph (), autotune (cache_path = cache_path ):
0 commit comments