
Commit b3126a6

JiuChen0, dannywillowliu-uchi, and root authored
Remove O(prompt_len) prompt copies (#35)
* Add batch inference support and CPU compatibility
  - Add --batch_size CLI argument for parallel sequence processing
  - Add conditional CUDA stream creation for CPU-only mode
  - Add device-aware ExecutionEnv and Policy resource distribution
  - Fix MPS compatibility on macOS
* Fix hardcoded model loading and support batch size
* Resolve dependency conflicts
* docs: refine README setup and usage sections for clarity and correctness
* Add batch-size-related updates
* Delete debug output
* Delete .id files
* Fix max token size problem
* Add prompt
* Reduce /dev/shm peak usage during warmup/prefill stage
* Delete dead code
* chore: comment out unused compare_tensors function
* Delete bitsandbytes quant
* Support FlexGen 4-bit quant
* Clean debug output for server id
* Add effective throughput
* Clean up unnecessary files
* Fix start compute time initialization error
* Use rolling buffer to avoid O(prompt_len) copy on each forward
* Fix debug I/O issue

---------

Co-authored-by: Danny Willow Liu <dannywillowliu@uchicago.edu>
Co-authored-by: root <root@investorairig80.maas>
1 parent 241bbc3 commit b3126a6

3 files changed

Lines changed: 49 additions & 19 deletions

File tree

src/bloombee/models/llama/block.py
src/bloombee/models/llama/model.py
src/bloombee/server/block_functions.py

Lines changed: 45 additions & 16 deletions
@@ -94,10 +94,10 @@ def forward(  # pyright: ignore[reportIncompatibleMethodOverride]
         output_attentions = False
         assert not output_attentions

-        # print('🔧 OptimizedLlamaAttention.forward(): received position_ids:', position_ids)
+        # print(' OptimizedLlamaAttention.forward(): received position_ids:', position_ids)
         # if position_ids is not None:
-        #     print(f'🔧 position_ids shape: {position_ids.shape}, dtype: {position_ids.dtype}')
-        #     print(f'🔧 position_ids content: {position_ids}')
+        #     print(f' position_ids shape: {position_ids.shape}, dtype: {position_ids.dtype}')
+        #     print(f' position_ids content: {position_ids}')

         if position_ids is None:
             past_seen_tokens = past_key_value[0].shape[2] if past_key_value is not None else 0
@@ -107,9 +107,9 @@ def forward(  # pyright: ignore[reportIncompatibleMethodOverride]
                 device=hidden_states.device,
                 dtype=torch.long
             ).unsqueeze(0)  # pyright: ignore[reportAssignmentType]
-            # print(f'🔧 Generated fallback position_ids: {position_ids}')
+            # print(f' Generated fallback position_ids: {position_ids}')

-        # print('🔧 Final position_ids before processing:', position_ids)
+        # print(' Final position_ids before processing:', position_ids)

         # Optimized: Avoid .item() CPU-GPU sync by using direct indexing
         # Most common case: 2D tensor [batch_size, seq_len]
@@ -124,7 +124,7 @@ def forward(  # pyright: ignore[reportIncompatibleMethodOverride]
         else:
             start_position = 0

-        # print(f'🔧 Extracted start_position: {start_position}')
+        # print(f' Extracted start_position: {start_position}')

         self.temp_hidden_states.val = super(OptimizedLlamaAttention, self).forward(
             hidden_states, cache_read_buf, weight_read_buf, attention_mask, cache_write_buf, start_position, k
@@ -210,6 +210,11 @@ def __init__(self, config: LlamaConfig, layer_id: int, env: ExecutionEnv, policy

         # GPU stream management optimization
         self._streams_initialized = False
+
+        # Rolling buffer for output_ids to avoid O(prompt_len) copy on each forward
+        self._cached_output_ids = None
+        self._cached_output_ids_shape = None
+        self._output_ids_prompt_initialized = False

         # log_mem(f"[LlamaDecoderLayer:{self.layer_id}] before init_all_weights")
         self.init_all_weights()
@@ -406,13 +411,16 @@ def forward(
         self._last_prompt_len = actual_prompt_len
         self._last_gen_len = max_new_tokens

+        # Reset output_ids prompt flag when task changes
+        self._output_ids_prompt_initialized = False
+
         if not self._is_initialized:
             self._is_initialized = True

         # Performance monitoring: record Task rebuild time
         if task_rebuild_start is not None:
             task_rebuild_time = (time.time() - task_rebuild_start) * 1000
-            if task_rebuild_time > 1.0:  # 只记录超过1ms的情况 (log only cases over 1 ms)
+            if task_rebuild_time > 1.0:  # Record only when it takes more than 1ms
                 print(f"[BLOCK_PERF] Layer {self.layer_id} Task rebuild took: {task_rebuild_time:.3f}ms")

         task = self._cached_task
@@ -424,8 +432,29 @@ def forward(
         num_prompts = len(task.inputs)
         prompt_len, gen_len = task.prompt_len, task.gen_len

-        self.output_ids = np.ones((num_prompts, prompt_len + gen_len), dtype=np.int64)
-        self.output_ids[:, :prompt_len] = np.asarray(task.inputs)
+        # Use rolling buffer to avoid O(prompt_len) copy on each forward
+        # Only reallocate when shape changes
+        output_ids_start = time.time()
+        target_shape = (num_prompts, prompt_len + gen_len)
+        if self._cached_output_ids is None or self._cached_output_ids_shape != target_shape:
+            # Shape changed, need to reallocate
+            self._cached_output_ids = np.ones(target_shape, dtype=np.int64)
+            self._cached_output_ids_shape = target_shape
+            self._output_ids_prompt_initialized = False
+            if verbose > 0:
+                print(f"[OUTPUT_IDS_PERF] Layer {self.layer_id}: Reallocated output_ids with shape {target_shape}")
+
+        # Only copy prompt tokens when necessary (first time or task changed)
+        if not self._output_ids_prompt_initialized:
+            self._cached_output_ids[:, :prompt_len] = np.asarray(task.inputs)
+            self._output_ids_prompt_initialized = True
+            if verbose > 0:
+                print(f"[OUTPUT_IDS_PERF] Layer {self.layer_id}: Initialized prompt tokens ({prompt_len} tokens)")
+
+        self.output_ids = self._cached_output_ids
+        output_ids_time = (time.time() - output_ids_start) * 1000
+        if output_ids_time > 1.0:
+            print(f"[OUTPUT_IDS_PERF] Layer {self.layer_id} output_ids setup took: {output_ids_time:.3f}ms")

         # Smart cache clearing - avoid clearing every time
         cache_clear_start = time.time()
@@ -496,10 +525,10 @@ def forward(
         if position_ids is not None and position_ids.numel() > 0:
             # Optimized: Avoid .item() sync
             current_position = position_ids.flatten()[0]
-            # print(f'🔧 Using actual position from position_ids: {current_position}')
+            # print(f' Using actual position from position_ids: {current_position}')
         else:
             current_position = 0
-            # print(f'🔧 No position_ids provided, using fallback position: {current_position}')
+            # print(f' No position_ids provided, using fallback position: {current_position}')

         i = current_position

@@ -613,7 +642,7 @@ def to_torch_tensor(x):
         outputs = (hidden_states, past_key_value)
         # log_mem(f"[Layer:{self.layer_id}] forward(end) out_shape={hidden_states.shape}")
         # Remove empty_cache call from each forward to reduce GPU overhead
-        # torch.cuda.empty_cache()  # 这会导致性能问题 (this causes performance problems)
+        # torch.cuda.empty_cache()
         return outputs

     def load_weight(self, i, j, k, overlap=True):
@@ -672,7 +701,7 @@ def store_cache(self, i, j, k, overlap=True):
             with torch.cuda.stream(self.store_cache_stream):
                 self.layers[j].store_cache(self.cache_home[j][k], self.cache_write_buf[j][k], i)
             # Remove unnecessary synchronization to reduce GPU blocking
-            # torch.cuda.synchronize()  # 这会造成性能瓶颈 (this creates a performance bottleneck)
+            # torch.cuda.synchronize()
         else:
             self.layers[j].store_cache(self.cache_home[j][k], self.cache_write_buf[j][k], i)

@@ -742,7 +771,7 @@ def compute_layer(self, i, j, k, position_ids=None, generated_tokens_num=0):
         if j == 1:
             self.hidden[0][j][k].val = self.temp_hidden.val

-        # print(f'🔧 compute_layer: i={i}, j={j}, k={k}, received position_ids={position_ids}')
+        # print(f' compute_layer: i={i}, j={j}, k={k}, received position_ids={position_ids}')

         self.layers[j].forward(hidden_states=self.hidden[0][j][k],
                                cache_read_buf=self.cache_read_buf[j][k],
@@ -784,10 +813,10 @@ def forward(
        seq_length_with_past = seq_length_with_past + past_key_values_length
        past_key_value = self._reorder_cache_from_bloom_to_llama(past_key_value, batch_size, past_key_values_length)

-       # print(f'🔧 WrappedLlamaBlock.forward: received position_ids={position_ids}')
+       # print(f' WrappedLlamaBlock.forward: received position_ids={position_ids}')
        if position_ids is not None:
            pass
-           # print(f'🔧 WrappedLlamaBlock.forward: position_ids shape={position_ids.shape}, content={position_ids}')
+           # print(f' WrappedLlamaBlock.forward: position_ids shape={position_ids.shape}, content={position_ids}')

        # print(f"WrappedLlamaBlock, hidden_states: {hidden_states}, seq_length: {seq_length}, past_key_value: {past_key_value}")
        # Optimized: Reuse cached attention_mask
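
To make the rolling-buffer hunk above easier to follow, here is a condensed, standalone Python sketch of the same pattern. The class and method names below are illustrative only (they are not the repo's API), and the Task/verbose plumbing from the diff is elided:

    # Standalone sketch of the rolling-buffer pattern (names are hypothetical).
    import numpy as np

    class OutputIdsBuffer:
        """Reuses one output_ids array across forward passes.

        The old code allocated a (num_prompts, prompt_len + gen_len) array and
        copied all prompt tokens on every forward, an O(prompt_len) cost per
        step. Here the allocation and the prompt copy happen once per task;
        each decode step then only writes the newly generated column.
        """

        def __init__(self):
            self._cached = None
            self._cached_shape = None
            self._prompt_initialized = False

        def get(self, inputs: np.ndarray, gen_len: int) -> np.ndarray:
            num_prompts, prompt_len = inputs.shape
            target_shape = (num_prompts, prompt_len + gen_len)
            if self._cached is None or self._cached_shape != target_shape:
                # Reallocate only when the shape actually changes.
                self._cached = np.ones(target_shape, dtype=np.int64)
                self._cached_shape = target_shape
                self._prompt_initialized = False
            if not self._prompt_initialized:
                # Copy prompt tokens once per task, not once per forward.
                self._cached[:, :prompt_len] = inputs
                self._prompt_initialized = True
            return self._cached

    buf = OutputIdsBuffer()
    prompt = np.arange(8, dtype=np.int64).reshape(1, 8)
    out = buf.get(prompt, gen_len=4)
    assert buf.get(prompt, gen_len=4) is out  # same buffer reused, no re-copy

The design choice: the O(prompt_len) copy is paid once per task instead of once per decode step, so steady-state decoding does O(1) bookkeeping here regardless of prompt length.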

src/bloombee/models/llama/model.py

Lines changed: 3 additions & 3 deletions
@@ -185,7 +185,7 @@ def prepare_inputs_for_generation(
            input_ids = input_ids[:, past_length:]
            # print(f" Past length case: {original_shape} -> {input_ids.shape}, kept tokens: {input_ids}")
        else:
-           print(f" No truncation needed: past_length={past_length}, input_ids.shape[1]={input_ids.shape[1]}")
+           logger.debug(f"No truncation needed: past_length={past_length}, input_ids.shape[1]={input_ids.shape[1]}")

        if (
            max_cache_length is not None
@@ -205,10 +205,10 @@ def prepare_inputs_for_generation(
        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
-           print(f" Using inputs_embeds for first generation step")
+           logger.debug("Using inputs_embeds for first generation step")
        else:
            model_inputs = {"input_ids": input_ids}
-           # print(f" Using input_ids: {input_ids}")
+           # logger.debug(f"Using input_ids: {input_ids}")

        model_inputs.update(
            {
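
Both hunks above assume a module-level logger is in scope. As a minimal sketch of what that presumably looks like (whether bloombee uses stdlib logging or a framework-provided get_logger here is an assumption; stdlib is shown):

    # Minimal module-level logger setup these hunks rely on (assumed, not repo code).
    import logging

    logger = logging.getLogger(__name__)

    # Unlike the unconditional print() it replaces, a debug record is emitted
    # only when the effective level is DEBUG or lower.
    logging.basicConfig(level=logging.DEBUG)
    logger.debug("No truncation needed: past_length=%d", 0)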

src/bloombee/server/block_functions.py

Lines changed: 1 addition & 0 deletions
@@ -289,6 +289,7 @@ async def iterate_rpc_inference(

        # Add Cross-GPU Transfer Latency measurement
        cross_gpu_start_time = perf_counter()
+       start_compute_time = perf_counter()  # Initialize compute time tracking

        # parse deep prompts (optional argument)
        has_prompts = prompts is not None and not is_dummy(prompts)
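
The added line guards against reading start_compute_time before anything has assigned it. A hypothetical reduction of that bug class (not the repo's actual control flow in iterate_rpc_inference):

    # Sketch: a timer read later in a loop must be initialized up front,
    # otherwise the first read can raise NameError or use a stale value.
    from time import perf_counter

    def run_steps(steps):
        cross_gpu_start_time = perf_counter()
        start_compute_time = perf_counter()  # the fix: always defined before use
        for is_compute in steps:
            if is_compute:
                start_compute_time = perf_counter()  # refreshed on compute steps
            # ... work happens here ...
            elapsed = perf_counter() - start_compute_time  # safe on every path
            print(f"compute elapsed: {elapsed:.6f}s")

    run_steps([False, True, True])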
