fix(attention): derive num_partition from max_model_len instead of block_length (#513)

rebel-jaehunryu · web-flow · commit 853cf33f6de5 · 2026-04-08T19:02:55.000+09:00
diff --git a/vllm_rbln/v1/attention/backends/flash_attention.py b/vllm_rbln/v1/attention/backends/flash_attention.py
@@ -1099,11 +1099,9 @@ def build(
 
         # The length of the partition equals the block size.
         partition_len = self.block_size
-        # no. of block(HW constraint) determines max sequence length.
-        # max_model_len(Model constraint) determines max sequence length.
-        # One of them is selected for max_seq_len.
-        block_length = self.cache_config.num_gpu_blocks * partition_len
-        max_seq_len = min(self.model_config.max_model_len, block_length)
+        # num_partition is derived from max_model_len (not hardware block count)
+        # to ensure seq_idx/seq_lens dimensions stay within block_table bounds.
+        max_seq_len = self.model_config.max_model_len
 
         num_partition = max_seq_len // partition_len
         cs = seq_idx.repeat(1, num_partition)
diff --git a/vllm_rbln/v1/worker/metrics.py b/vllm_rbln/v1/worker/metrics.py
@@ -219,7 +219,9 @@ def record_prefill(
                 f"got {len(request_ids)}: {request_ids}"
             )
             request_id = request_ids[0]
-        self.prefill_metrics.add_measurement(latency, token_count, host_time, device_time, ccl_time)
+        self.prefill_metrics.add_measurement(
+            latency, token_count, host_time, device_time, ccl_time
+        )
         if request_id:
             self.prefill_metrics_by_request_id.add_measurement(
                 request_id, latency, token_count, host_time, device_time, ccl_time
diff --git a/vllm_rbln/v1/worker/optimum_model_runner.py b/vllm_rbln/v1/worker/optimum_model_runner.py
@@ -456,7 +456,8 @@ def _prepare_inputs(
             finished_requests_ids=list(finished_requests_ids),
             cached_block_tables=cached_block_tables,
             cached_lengths=cached_lengths,
-            is_prompt=is_prefill, # FIXME unify the variable name is_prefill and is_prompt
+            # FIXME: unify the variable names is_prefill and is_prompt
+            is_prompt=is_prefill,
             dummy_block=scheduler_output.dummy_block,
         )
         return model_input, num_scheduled_tokens