Skip to content

Commit 853cf33

Browse files
fix(attention): derive num_partition from max_model_len instead of block_length (#513)
1 parent bba7042 commit 853cf33

3 files changed

Lines changed: 8 additions & 7 deletions

File tree

vllm_rbln/v1/attention/backends/flash_attention.py

Lines changed: 3 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1099,11 +1099,9 @@ def build(
10991099

11001100
# The length of the partition equals the block size.
11011101
partition_len = self.block_size
1102-
# no. of block(HW constraint) determines max sequence length.
1103-
# max_model_len(Model constraint) determines max sequence length.
1104-
# One of them is selected for max_seq_len.
1105-
block_length = self.cache_config.num_gpu_blocks * partition_len
1106-
max_seq_len = min(self.model_config.max_model_len, block_length)
1102+
# num_partition is derived from max_model_len (not hardware block count)
1103+
# to ensure seq_idx/seq_lens dimensions stay within block_table bounds.
1104+
max_seq_len = self.model_config.max_model_len
11071105

11081106
num_partition = max_seq_len // partition_len
11091107
cs = seq_idx.repeat(1, num_partition)

vllm_rbln/v1/worker/metrics.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,9 @@ def record_prefill(
219219
f"got {len(request_ids)}: {request_ids}"
220220
)
221221
request_id = request_ids[0]
222-
self.prefill_metrics.add_measurement(latency, token_count, host_time, device_time, ccl_time)
222+
self.prefill_metrics.add_measurement(
223+
latency, token_count, host_time, device_time, ccl_time
224+
)
223225
if request_id:
224226
self.prefill_metrics_by_request_id.add_measurement(
225227
request_id, latency, token_count, host_time, device_time, ccl_time

vllm_rbln/v1/worker/optimum_model_runner.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -456,7 +456,8 @@ def _prepare_inputs(
456456
finished_requests_ids=list(finished_requests_ids),
457457
cached_block_tables=cached_block_tables,
458458
cached_lengths=cached_lengths,
459-
is_prompt=is_prefill, # FIXME unify the variable name is_prefill and is_prompt
459+
# FIXME unify the variable name is_prefill and is_prompt
460+
is_prompt=is_prefill,
460461
dummy_block=scheduler_output.dummy_block,
461462
)
462463
return model_input, num_scheduled_tokens

0 commit comments

Comments
 (0)