-
Notifications
You must be signed in to change notification settings - Fork 1.1k
[infer] Support infer cache impl #7150
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -62,6 +62,7 @@ def __init__( | |
| model_kwargs: Optional[Dict[str, Any]] = None, | ||
| template: Optional[Template] = None, | ||
| reranker_use_activation: bool = True, | ||
| cache_impl: Optional[str] = None, | ||
| **kwargs): | ||
| download_model = kwargs.pop('download_model', True) | ||
| self.model, self.processor = get_model_tokenizer( | ||
|
|
@@ -80,6 +81,7 @@ def __init__( | |
| model_kwargs=model_kwargs, | ||
| **kwargs) | ||
| self.reranker_use_activation = reranker_use_activation | ||
| self.cache_impl = cache_impl | ||
| self.max_batch_size = max_batch_size | ||
| if isinstance(adapters, str): | ||
| adapters = [adapters] | ||
|
|
@@ -151,11 +153,21 @@ def _add_adapter(self, adapter_path: str, adapter_name: Optional[str] = None) -> | |
| self.model = Swift.from_pretrained(self.model, adapter_path, adapter_name) | ||
|
|
||
| @classmethod | ||
| def from_model_template(cls, model, template=None, *, max_batch_size: int = 1): | ||
| def from_model_template( | ||
| cls, | ||
| model, | ||
| template=None, | ||
| *, | ||
| max_batch_size: int = 1, | ||
| reranker_use_activation: bool = True, | ||
| cache_impl: Optional[str] = None, | ||
| ): | ||
| self = super().__new__(cls) | ||
| self.model = model | ||
| self.processor = template.processor | ||
| self.max_batch_size = max_batch_size | ||
| self.reranker_use_activation = reranker_use_activation | ||
| self.cache_impl = cache_impl | ||
| self._post_init(template) | ||
| return self | ||
|
|
||
|
|
@@ -233,6 +245,8 @@ def _model_generate(**kwargs): | |
| template.generate(self.model, **kwargs) | ||
|
|
||
| generate_kwargs = template.prepare_generate_kwargs(generate_kwargs, model=self.model) | ||
| if self.cache_impl is not None: | ||
| generate_kwargs['cache_implementation'] = self.cache_impl | ||
|
Comment on lines 247 to +249
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This block of code, which prepares … |
||
| thread = Thread(target=_model_generate, kwargs=generate_kwargs) | ||
| thread.start() | ||
| batch_size = inputs['attention_mask'].shape[0] | ||
|
|
@@ -392,6 +406,8 @@ def _infer_full(self, template: Template, inputs: Dict[str, Any], *, generation_ | |
| generate_kwargs['adapter_names'] = adapter_names | ||
| num_prompt_tokens = self._get_num_tokens(inputs) | ||
| generate_kwargs = template.prepare_generate_kwargs(generate_kwargs, model=self.model) | ||
| if self.cache_impl is not None: | ||
| generate_kwargs['cache_implementation'] = self.cache_impl | ||
| output = dict(template.generate(self.model, **generate_kwargs)) | ||
| output.pop('past_key_values', None) | ||
| batched_generate_ids = output['sequences'] | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For better type clarity and consistency, the type hint for
`cache_impl` should be `Optional[str]` since its default value is `None`. This aligns with its usage and its definition in other parts of the codebase, such as in `PtEngine`.