
Support huggingface chat template #1095

Closed · wants to merge 7 commits
60 changes: 60 additions & 0 deletions docs/en/inference/pipeline.md
@@ -235,6 +235,66 @@ This class contains the generation parameters used by inference engines.
| min_new_tokens | int | The minimum number of tokens to generate, ignoring the number of tokens in the prompt. | None |
| skip_special_tokens | bool | Whether or not to remove special tokens in the decoding. | True |

## Customize chat template
[Review comment — Collaborator]: The custom chat template could be documented in a standalone page, and pipeline.md and restful_api.md could then link to it.


LMDeploy supports two methods for adding chat templates:

- One method is to customize a Python chat template class modeled on LMDeploy's existing templates; once registered, it can be used directly. The advantages are a high degree of customization and strong control. Below is an example of registering an LMDeploy chat template:
```python
from typing import Dict, Union

from lmdeploy import ChatTemplateConfig, pipeline
from lmdeploy.model import MODELS, BaseModel


@MODELS.register_module(name='customized_model')
class CustomizedModel(BaseModel):
"""A customized chat template."""

def messages2prompt(self,
messages: Union[str, Dict],
sequence_start: bool = True) -> str:
"""This func apply chat template for input messages
Args:
messages (str | Dict): input messages. Could be a str prompt or
OpenAI format chat history. The former is for interactive chat.
sequence_start (bool): Only for interactive chatting. Begin of the
prompt token will be removed in interactive chatting when
the sequence_start is False.
Returns:
string. The return value will be sent to tokenizer.encode directly.
"""
print(f'Any modification can be done for {messages}')
return str(messages) # just a dummpy conversion.


pipe = pipeline('internlm/internlm2-chat-7b',
chat_template_config=ChatTemplateConfig('customized_model'))

response = pipe('hi')
print(response) # text completion in this case because of customized_model
```
In this example, we registered an LMDeploy chat template that simply returns the input prompt as-is, or directly converts the chat history into a string. Users need to fill in the actual template logic themselves, ideally covering both input forms, so that the pipeline can handle both string inputs and OpenAI-format chat history after initialization.
- The other method is to pass in a [Huggingface chat template](https://huggingface.co/docs/transformers/main/en/chat_templating), i.e. a Jinja template.
Launching from a Python script looks like this:
```python
from lmdeploy import ChatTemplateConfig, pipeline

pipe = pipeline('internlm/internlm2-chat-7b',
chat_template_config=ChatTemplateConfig(
jinja_template='jinja_template_str_or_file'))

response = pipe([[{
'role': 'user',
'content': 'Hi, pls intro yourself'
}], [{
'role': 'user',
'content': 'Shanghai is'
}]])
print(response) # Jinja template can only handle OpenAI format chat history
```
It's important to note that once a Jinja template is passed in, the pipeline can only process chat history in the OpenAI format; a minimal inline template is sketched below.
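
For illustration, here is a minimal, hypothetical Jinja template passed as an inline string rather than a file path. It assumes, as Huggingface chat templates do, that the template is rendered with a `messages` variable; the plain `role:` tags are placeholders and should be replaced with the special tokens your target model expects.

```python
from lmdeploy import ChatTemplateConfig, pipeline

# A toy template: print each turn as "role: content" and end with an
# assistant tag so the model continues from there. Placeholder format only.
jinja_template = (
    "{% for message in messages %}"
    "{{ message['role'] }}: {{ message['content'] }}\n"
    "{% endfor %}"
    "assistant: ")

pipe = pipeline('internlm/internlm2-chat-7b',
                chat_template_config=ChatTemplateConfig(
                    jinja_template=jinja_template))

response = pipe([[{'role': 'user', 'content': 'Hi, pls intro yourself'}]])
print(response)
```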

## FAQs

- *RuntimeError: context has already been set*. If you get this with tp>1 in the PyTorch backend, please make sure the Python script has the following
62 changes: 62 additions & 0 deletions docs/en/serving/restful_api.md
@@ -161,6 +161,68 @@ openaoe -f /path/to/your/config-template.yaml

Please refer to the [guidance](https://github.com/InternLM/OpenAOE/blob/main/docs/tech-report/model_serving_by_lmdeploy/model_serving_by_lmdeploy.md) for more deploy information.

### Customize the chat template

LMDeploy supports two forms of chat templates:

- The first approach is to customize a Python chat template class modeled on the existing LMDeploy templates; once registered, it can be used directly. The advantages are a high degree of customization and strong control. Below is an example of registering an LMDeploy chat template.

```python
from typing import Dict, Union

from lmdeploy import ChatTemplateConfig, serve
from lmdeploy.model import MODELS, BaseModel


@MODELS.register_module(name='customized_model')
class CustomizedModel(BaseModel):
"""A customized chat template."""

def messages2prompt(self,
messages: Union[str, Dict],
sequence_start: bool = True) -> str:
"""This func apply chat template for input messages
Args:
messages (str | Dict): input messages. Could be a str prompt or
OpenAI format chat history. The former is for interactive chat.
sequence_start (bool): Only for interactive chatting. Begin of the
prompt token will be removed in interactive chatting when
the sequence_start is False.
Returns:
string. The return value will be sent to tokenizer.encode directly.
"""
print(f'Any modification can be done for {messages}')
return str(messages) # just a dummpy conversion.


client = serve('internlm/internlm2-chat-7b',
chat_template_config=ChatTemplateConfig('customized_model'))
for item in client.chat_completions_v1('customized_model', [{
'role': 'user',
'content': 'hi'
}]):
print(item)
```

In this example, we registered an LMDeploy chat template that simply returns the input prompt as-is, or directly converts the chat history into a string. Users need to fill in the actual template logic themselves, ideally covering both input forms. With a service started this way, all endpoints can be used.

- Another approach is to use a [Huggingface chat template](https://huggingface.co/docs/transformers/main/en/chat_templating).
  You can start the service by passing the template directly through the command line, or by passing it to LMDeploy's `serve` API in a Python script.

```shell
lmdeploy serve api_server internlm/internlm2-chat-7b --jinja-template ${JINJA_STR_OR_FILE}
```

```python
from lmdeploy import ChatTemplateConfig, serve

serve('internlm/internlm2-chat-7b',
ChatTemplateConfig(jinja_template='jinja_template_str_or_file'),
block=True)
```

It's important to note that once a Jinja template is passed in, clients should preferably query the served model name via the `/v1/models` endpoint first. In addition, Jinja templates only work with OpenAI-format inputs, which means they are only usable through the OpenAI-compatible endpoints; a request sketch follows below.
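
As a sketch of that workflow (assuming the server runs at the default `http://0.0.0.0:23333` address and returns an OpenAI-style model list), the snippet below first asks `/v1/models` for the served model name and then calls the OpenAI-compatible `/v1/chat/completions` endpoint with `requests`:

```python
import requests

base_url = 'http://0.0.0.0:23333'  # assumed default api_server address

# Query the served model name first, as recommended above
# (assumes an OpenAI-style {'data': [{'id': ...}]} response).
model_name = requests.get(f'{base_url}/v1/models').json()['data'][0]['id']

# Send an OpenAI-format chat history to the chat completions endpoint.
response = requests.post(f'{base_url}/v1/chat/completions',
                         json={
                             'model': model_name,
                             'messages': [{
                                 'role': 'user',
                                 'content': 'Hi, pls intro yourself'
                             }]
                         })
print(response.json())
```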

### FAQ

1. When a user gets `"finish_reason":"length"`, it means the session is too long to be continued. The session length can be
68 changes: 68 additions & 0 deletions docs/zh_cn/inference/pipeline.md
@@ -235,6 +235,74 @@ print(response)
| min_new_tokens | int | The minimum number of tokens to generate. | None |
| skip_special_tokens | bool | Whether to skip special tokens in decoding. | True |

## Customize the chat template

LMDeploy supports two ways of adding chat templates:

- One is to customize a Python chat template class modeled on LMDeploy's existing templates; once registered, it can be used directly. The advantages are a high degree of customization and strong control.
  Below is an example of registering an LMDeploy chat template:

```python
from typing import Dict, Union

from lmdeploy import ChatTemplateConfig, pipeline
from lmdeploy.model import MODELS, BaseModel


@MODELS.register_module(name='customized_model')
class CustomizedModel(BaseModel):
"""A customized chat template."""

def messages2prompt(self,
messages: Union[str, Dict],
sequence_start: bool = True) -> str:
"""This func apply chat template for input messages
Args:
messages (str | Dict): input messages. Could be a str prompt or
OpenAI format chat history. The former is for interactive chat.
sequence_start (bool): Only for interactive chatting. Begin of the
prompt token will be removed in interactive chatting when
the sequence_start is False.
Returns:
string. The return value will be sent to tokenizer.encode directly.
"""
print(f'Any modification can be done for {messages}')
return str(messages) # just a dummpy conversion.


pipe = pipeline('internlm/internlm2-chat-7b',
chat_template_config=ChatTemplateConfig('customized_model'))

response = pipe('hi')
print(response) # text completion in this case because of customized_model
```

In this example, we registered an LMDeploy chat template that simply returns the input prompt as-is, or directly converts the chat history into a string. Users need to fill in the actual template logic themselves, ideally covering both input forms, so that the pipeline can handle both string inputs and OpenAI-format chat history after initialization (a fuller sketch covering both cases is given after this list).

- The other is to pass in a [Huggingface chat template](https://huggingface.co/docs/transformers/main/en/chat_templating), i.e. a Jinja template.
  Launching from a Python script looks like this:

```python
from lmdeploy import ChatTemplateConfig, pipeline

pipe = pipeline('internlm/internlm2-chat-7b',
chat_template_config=ChatTemplateConfig(
jinja_template='jinja_template_str_or_file'))

response = pipe([[{
'role': 'user',
'content': 'Hi, pls intro yourself'
}], [{
'role': 'user',
'content': 'Shanghai is'
}]])
print(response) # Jinja template can only handle OpenAI format chat history
```

Note that once a Jinja template is passed in, the pipeline can only process chat history in the OpenAI format.
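
As referenced above, here is a minimal sketch of a `messages2prompt` implementation that covers both input forms. The hypothetical `two_input_model` name and the plain `role:` tags are illustrative placeholders, not the format of any real model:

```python
from typing import Dict, Union

from lmdeploy.model import MODELS, BaseModel


@MODELS.register_module(name='two_input_model')
class TwoInputModel(BaseModel):
    """A sketch of a template that handles both a plain prompt and an
    OpenAI-format chat history."""

    def messages2prompt(self,
                        messages: Union[str, Dict],
                        sequence_start: bool = True) -> str:
        if isinstance(messages, str):
            # Interactive chat: wrap the raw prompt as a single user turn.
            messages = [{'role': 'user', 'content': messages}]
        prompt = ''
        for message in messages:
            prompt += f"{message['role']}: {message['content']}\n"
        # End with an assistant tag so the model continues from there.
        return prompt + 'assistant: '
```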

## FAQs

- *RuntimeError: context has already been set*. If you hit this error when using tp>1 with the PyTorch backend, make sure the Python script has the following as its entry point
68 changes: 68 additions & 0 deletions docs/zh_cn/serving/restful_api.md
@@ -156,6 +156,74 @@ openaoe -f /path/to/your/config-template.yaml

Please refer to the [deployment guide](https://github.com/InternLM/OpenAOE/blob/main/docs/tech-report/model_serving_by_lmdeploy/model_serving_by_lmdeploy.md) for more information.

### Customize the chat template

LMDeploy supports two ways of adding chat templates:

- One is to customize a Python chat template class modeled on LMDeploy's existing templates; once registered, it can be used directly. The advantages are a high degree of customization and strong control.
  Below is an example of registering an LMDeploy chat template:

```python
from typing import Dict, Union

from lmdeploy import ChatTemplateConfig, serve
from lmdeploy.model import MODELS, BaseModel


@MODELS.register_module(name='customized_model')
class CustomizedModel(BaseModel):
"""A customized chat template."""

def messages2prompt(self,
messages: Union[str, Dict],
sequence_start: bool = True) -> str:
"""This func apply chat template for input messages
Args:
messages (str | Dict): input messages. Could be a str prompt or
OpenAI format chat history. The former is for interactive chat.
sequence_start (bool): Only for interactive chatting. Begin of the
prompt token will be removed in interactive chatting when
the sequence_start is False.
Returns:
string. The return value will be sent to tokenizer.encode directly.
"""
print(f'Any modification can be done for {messages}')
return str(messages) # just a dummpy conversion.


client = serve('internlm/internlm2-chat-7b',
chat_template_config=ChatTemplateConfig('customized_model'))
for item in client.chat_completions_v1('customized_model', [{
'role': 'user',
'content': 'hi'
}]):
print(item)
```

In this example, we registered an LMDeploy chat template that simply returns the input prompt as-is, or directly converts the chat history into a string. Users need to fill in the actual template logic themselves, ideally covering both input forms. With a service started this way, all endpoints can be used.

- The other is to pass in a [Huggingface chat template](https://huggingface.co/docs/transformers/main/en/chat_templating), i.e. a Jinja template.
  The service can be launched by passing the template directly on the command line, or by passing it to LMDeploy's `serve` API in a Python script.

```shell
lmdeploy serve api_server internlm/internlm2-chat-7b --jinja-template ${JINJA_STR_OR_FILE}
```

Launching from a Python script looks like this:

```python
from lmdeploy import ChatTemplateConfig, serve

serve('internlm/internlm2-chat-7b',
ChatTemplateConfig(jinja_template='jinja_template_str_or_file'),
block=True)
```

Note that once a Jinja template is passed in, clients should preferably query the served model name via the `/v1/models` endpoint first. In addition, Jinja templates can only be used with OpenAI-format inputs, which means only the OpenAI-compatible endpoints can be used.

### FAQ

1. When the returned finish reason is `"finish_reason":"length"`, it means the session exceeds the maximum length. To adjust the maximum session length, set the `--session_len` argument when launching `api_server`.
5 changes: 4 additions & 1 deletion lmdeploy/cli/serve.py
@@ -126,6 +126,7 @@ def add_parser_api_server():

# chat template args
ArgumentHelper.meta_instruction(parser)
ArgumentHelper.jinja_template(parser)
ArgumentHelper.cap(parser)

# pytorch engine args
@@ -210,6 +211,7 @@ def gradio(args):
chat_template_config = ChatTemplateConfig(
model_name=args.model_name,
meta_instruction=args.meta_instruction,
jinja_template=args.jinja_template,
capability=args.cap)
run(args.model_path_or_server,
server_name=args.server_name,
@@ -244,7 +246,8 @@ def api_server(args):
chat_template_config = ChatTemplateConfig(
model_name=args.model_name,
meta_instruction=args.meta_instruction,
capability=args.cap)
capability=args.cap,
jinja_template=args.jinja_template)
run_api_server(args.model_path,
backend=args.backend,
backend_config=backend_config,
12 changes: 12 additions & 0 deletions lmdeploy/cli/utils.py
@@ -326,6 +326,18 @@ def meta_instruction(parser):
default=None,
help='System prompt for ChatTemplateConfig')

@staticmethod
def jinja_template(parser):
"""Add argument jinjia template to parser."""

return parser.add_argument(
'--jinja-template',
type=str,
default=None,
help=\
'The file path to the chat template, or the template in single-line form. Could refer to https://huggingface.co/docs/transformers/main/en/chat_templating' # noqa
)

@staticmethod
def cache_max_entry_count(parser):
"""Add argument cache_max_entry_count to parser."""
19 changes: 17 additions & 2 deletions lmdeploy/model.py
@@ -1,4 +1,5 @@
# Copyright (c) OpenMMLab. All rights reserved.
import codecs
import dataclasses
import os
from abc import abstractmethod
@@ -34,7 +35,7 @@ class ChatTemplateConfig:
capability: ('completion' | 'infilling' | 'chat' | 'python') = None
""" # noqa: E501

model_name: str
model_name: str = None
system: Optional[str] = None
meta_instruction: Optional[str] = None
eosys: Optional[str] = None
@@ -44,17 +45,31 @@ eoa: Optional[str] = None
eoa: Optional[str] = None
capability: Optional[Literal['completion', 'infilling', 'chat',
'python']] = None
jinja_template: Optional[str] = None

@property
def chat_template(self):
attrs = {
key: value
for key, value in dataclasses.asdict(self).items()
if value is not None
if value is not None and key != 'jinja_template'
}
model: BaseModel = MODELS.get(self.model_name).from_config(**attrs)
return model

def get_jinja_template(self):
"""Get the jinja template."""
if self.jinja_template is not None:
try:
with open(self.jinja_template, 'r') as f:
template = f.read()
except OSError:
                # If the argument is not a readable file path, treat it as
                # the template string itself and decode it so escape
                # sequences are interpreted correctly
template = codecs.decode(self.jinja_template, 'unicode_escape')
return template
return None


@MODELS.register_module(name='internlm')
@MODELS.register_module(name='llama')