fastdeploy/worker/input_batch.py (49 additions, 22 deletions)
@@ -17,6 +17,7 @@
 import paddle
 from paddleformers.utils.log import logger
 
+from fastdeploy import envs
 from fastdeploy.config import CacheConfig, FDConfig, ModelConfig, SpeculativeConfig
 from fastdeploy.model_executor.layers.rotary_embedding import get_rope
 from fastdeploy.model_executor.logits_processor import build_logits_processors
@@ -577,7 +578,11 @@ def reset_share_inputs(self):
         # Reset reasoning buffers
         fill_paddle_tensor(self, "reasoning_status", 0)
         # Reset reasoning allowed tokens (not using fill_paddle_tensor since it's a fixed tensor)
-        self.reasoning_allowed_tokens = paddle.to_tensor([100973, 100975], dtype="int64")
+        if envs.FD_DETERMINISTIC_MODE and hasattr(self, "reasoning_allowed_tokens"):
+            # In-place copy to preserve GPU address for CUDAGraph safety.
+            self.reasoning_allowed_tokens[:] = paddle.to_tensor([100973, 100975], dtype="int64")
+        else:
+            self.reasoning_allowed_tokens = paddle.to_tensor([100973, 100975], dtype="int64")
 
         # Reset block tables
         fill_paddle_tensor(self, "block_tables", -1)
@@ -590,8 +595,15 @@ def reset_share_inputs(self):
                 -1,
             )
         )
-        self.free_list = paddle.to_tensor(free_list, dtype="int32")
-        self.free_list_len = paddle.full([1], len(free_list), dtype="int32")
+        if envs.FD_DETERMINISTIC_MODE and hasattr(self, "free_list"):
+            # In-place copy to preserve GPU address for CUDAGraph safety.
+            self.free_list.fill_(0)
+            free_list_tensor = paddle.to_tensor(free_list, dtype="int32")
+            self.free_list[: len(free_list)] = free_list_tensor
+            self.free_list_len.fill_(len(free_list))
+        else:
+            self.free_list = paddle.to_tensor(free_list, dtype="int32")
+            self.free_list_len = paddle.full([1], len(free_list), dtype="int32")
 
         # Reset stop sequences
         fill_paddle_tensor(self, "stop_seqs_len", 0)
@@ -625,29 +637,44 @@ def reset_share_inputs(self):
             else:
                 rope_head_dim = head_dim // 2
 
-            self.rope_emb = paddle.full(
-                shape=[
-                    max_num_seqs,
-                    2,
-                    1,
-                    self.model_config.max_model_len,
-                    1,
-                    rope_head_dim,
-                ],
-                fill_value=0,
-                dtype="float32",
-            )
+            if envs.FD_DETERMINISTIC_MODE and hasattr(self, "rope_emb"):
+                # In-place fill to preserve GPU address for CUDAGraph safety.
+                self.rope_emb.fill_(0)
+            else:
+                self.rope_emb = paddle.full(
+                    shape=[
+                        max_num_seqs,
+                        2,
+                        1,
+                        self.model_config.max_model_len,
+                        1,
+                        rope_head_dim,
+                    ],
+                    fill_value=0,
+                    dtype="float32",
+                )
             self.image_features = None
             self.image_features_list = None
         else:
             # Reset non-multimodal rope_emb
-            self.rope_emb = get_rope(
-                rotary_dim=self.model_config.head_dim,
-                position_ids=paddle.arange(self.model_config.max_model_len).reshape((1, -1)),
-                base=self.model_config.rope_theta,
-                model_config=self.model_config,
-                partial_rotary_factor=self.model_config.partial_rotary_factor,
-            )
+            if envs.FD_DETERMINISTIC_MODE and hasattr(self, "rope_emb"):
+                # In-place copy to preserve GPU address for CUDAGraph safety.
+                new_rope_emb = get_rope(
+                    rotary_dim=self.model_config.head_dim,
+                    position_ids=paddle.arange(self.model_config.max_model_len).reshape((1, -1)),
+                    base=self.model_config.rope_theta,
+                    model_config=self.model_config,
+                    partial_rotary_factor=self.model_config.partial_rotary_factor,
+                )
+                self.rope_emb.copy_(new_rope_emb, False)
+            else:
+                self.rope_emb = get_rope(
+                    rotary_dim=self.model_config.head_dim,
+                    position_ids=paddle.arange(self.model_config.max_model_len).reshape((1, -1)),
+                    base=self.model_config.rope_theta,
+                    model_config=self.model_config,
+                    partial_rotary_factor=self.model_config.partial_rotary_factor,
+                )
 
         # Reset other miscellaneous tensors
         fill_paddle_tensor(self, "mask_rollback", 0)