Skip to content

fix stopwords kv cache #3494

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 12, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions lmdeploy/pytorch/engine/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ class InferOutput:
def _tensorlize_block_offsets(block_offsets, dtype=torch.int32):
"""tensorlize block_offsets."""
from torch.nn.utils.rnn import pad_sequence
- block_offsets = [torch.from_numpy(off).to(dtype) for off in block_offsets]
- block_offsets = pad_sequence(block_offsets, batch_first=True)
+ block_offsets = [torch.from_numpy(off) for off in block_offsets]
+ block_offsets = pad_sequence(block_offsets, batch_first=True).to(dtype)
return block_offsets


Expand Down Expand Up @@ -541,6 +541,7 @@ def __update_max_new_tokens(msg):
req.data['token_ids'],
multimodals=req.data.get('input_multimodals'),
embeddings=req.data.get('input_embeddings'),
+ append_tokens=True,
)
msg.num_new_tokens = 0
msg.sampling_param = sampling_param
Expand Down Expand Up @@ -699,8 +700,6 @@ def update_running(self, running: SeqList, next_token_ids: torch.Tensor, stopped
msg.update_token_ids(update_token, model_meta=model_meta)
msg.num_new_tokens += 1
if stop:
- update_token = _EMPTY_TOKEN
- msg.update_token_ids(update_token, model_meta=model_meta)
msg.status = MessageStatus.STOPPED

def _make_infer_outputs(self, next_token_ids: torch.LongTensor, running: SeqList, logits: torch.Tensor,
Expand Down
13 changes: 10 additions & 3 deletions lmdeploy/pytorch/messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -567,11 +567,15 @@ def update_token_ids(self,
token_ids: Tensor,
multimodals: MultiModalInputs = None,
embeddings: List[InputEmbeddings] = None,
- model_meta: Dict[str, Any] = None):
+ model_meta: Dict[str, Any] = None,
+ append_tokens: bool = False):
"""Update token ids, old token ids will be added to history."""
old_num_history_ids = self._num_history_ids

- self._num_history_ids += self._num_token_ids
+ # update history
+ if not append_tokens:
+     self._num_history_ids += self._num_token_ids
+
# update history image nums
self._num_history_images += self._num_images
self._num_images = 0
Expand Down Expand Up @@ -601,7 +605,10 @@ def update_token_ids(self,
token_ids = np.array(token_ids)
if token_ids.ndim == 0:
token_ids = token_ids[None]
- self._num_token_ids = len(token_ids)
+ if append_tokens:
+     self._num_token_ids += len(token_ids)
+ else:
+     self._num_token_ids = len(token_ids)
self.history_cache.append(token_ids)
self.random_offsets += 1
self.arrive_time = time.time()
Expand Down