enhance(BA-6053): preserve str start_command form in model definitions

achimnol · achimnol · commit aa7d19a642f5 · 2026-05-17T16:48:42.000+09:00
Stop coercing string ``start_command`` values into argv lists via ``shlex.split`` in the API and config validators. The kernel runner already wraps ``str`` commands as ``[shell, "-c", str]`` at exec time (``kernel/service.py``), so passing the original string through gives users full shell semantics — line continuations, ``$VAR`` expansion, pipes — and removes the need to manually strip backslashes from copy-pasted multi-line vendor recipes (e.g. vLLM). - ``ModelServiceConfig.start_command`` and the draft mirror now accept ``str | list[str] | None``; the previous ``shlex.split`` validator is removed. - ``{model_path}`` substitution and ``with_args_appended`` handle both forms; preset ARGS appended to a string command are shell-quoted via ``shlex.join``. - DTOs in ``common/dto/manager/v2/deployment`` are widened to match. The GraphQL types expose the field as ``JSON | None`` (the same pattern already used for ``ModelMetadata.version``) since GraphQL cannot natively represent a scalar/list union. Existing list-form definitions are unaffected. resolves #11624
diff --git a/changes/11624.enhance.md b/changes/11624.enhance.md
@@ -0,0 +1 @@
+Preserve the original `str` form of `start_command` in model definitions all the way to the kernel runner so the kernel runner's existing `[shell, "-c", str]` wrapping handles shell semantics (line continuations, `$VAR` expansion, pipes) — eliminating the need to manually strip backslashes from copy-pasted multi-line vendor recipes (e.g. vLLM).
diff --git a/src/ai/backend/common/config.py b/src/ai/backend/common/config.py
@@ -14,7 +14,6 @@
     AliasChoices,
     ConfigDict,
     Field,
-    field_validator,
 )
 
 from . import validators as tx
@@ -144,16 +143,6 @@ def snake_to_kebab_case(string: str) -> str:
 agent_selector_config_iv = t.Dict({}) | agent_selector_globalconfig_iv
 
 
-def _normalize_start_command(value: Any) -> Any:
-    """Coerce legacy ``str`` ``start_command`` into argv list via
-    :func:`shlex.split`. Lists, ``None``, and other types pass through so
-    the schema's strict check rejects them.
-    """
-    if isinstance(value, str):
-        return shlex.split(value)
-    return value
-
-
 model_definition_iv = t.Dict({
     t.Key("models"): t.List(
         t.Dict({
@@ -170,8 +159,9 @@ def _normalize_start_command(value: Any) -> Any:
                         t.Key("args"): t.Dict().allow_extra("*"),
                     })
                 ),
-                t.Key("start_command", default=None): (t.Null | t.String | t.List(t.String))
-                >> _normalize_start_command,
+                # A ``str`` ``start_command`` is preserved as-is so the
+                # kernel runner can wrap it as ``[shell, "-c", str]``.
+                t.Key("start_command", default=None): t.Null | t.String | t.List(t.String),
                 t.Key("shell", default="/bin/bash"): t.String,
                 t.Key("port"): t.ToInt[1:],
                 t.Key("health_check", default=None): t.Null
@@ -258,14 +248,23 @@ class ModelServiceConfig(BaseConfigModel):
         default_factory=list,
         description="List of pre-start actions to execute before starting the model service.",
     )
-    start_command: list[str] | None = Field(
+    start_command: str | list[str] | None = Field(
         default=None,
         description=(
-            "Argv list to start the model service. ``{model_path}`` in any "
-            "token is replaced per-token with the resolved ``model_path`` "
-            "before launch. ``None`` falls back to the image's default CMD."
+            "Command to start the model service. Two forms are accepted: "
+            "an argv list (``list[str]``) which is exec'ed directly, or a "
+            "single shell script string which the kernel runner wraps as "
+            "``[shell, '-c', str]`` (giving the user full shell semantics "
+            "such as line continuations, ``$VAR`` expansion, and pipes). "
+            "``{model_path}`` is replaced with the resolved ``model_path`` "
+            "before launch — per-token for the list form, in-place for the "
+            "string form. ``None`` falls back to the image's default CMD."
         ),
-        examples=[["python", "service.py"], ["vllm", "serve", "{model_path}"]],
+        examples=[
+            ["python", "service.py"],
+            ["vllm", "serve", "{model_path}"],
+            "vllm serve {model_path} --tensor-parallel-size 2",
+        ],
     )
     shell: str = Field(
         default="/bin/bash",
@@ -282,11 +281,6 @@ class ModelServiceConfig(BaseConfigModel):
         description="Health check configuration for the model service.",
     )
 
-    @field_validator("start_command", mode="before")
-    @classmethod
-    def _coerce_start_command(cls, value: Any) -> Any:
-        return _normalize_start_command(value)
-
 
 class ModelMetadata(BaseConfigModel):
     author: str | None = Field(
@@ -498,7 +492,12 @@ def health_check_config(self) -> ModelHealthCheck | None:
 
     def with_args_appended(self, args: list[str]) -> ModelDefinition:
         """Return a copy with ``args`` appended to each model's
-        ``service.start_command`` as separate argv tokens.
+        ``service.start_command``.
+
+        For the list form, ``args`` are concatenated as separate argv
+        tokens.  For the string form, ``args`` are shell-quoted via
+        :func:`shlex.join` and appended after a single space so they are
+        parsed by the same shell that runs the user's script.
 
         Models with ``service is None`` are passed through unchanged;
         a model whose ``start_command`` is ``None`` receives ``args``
@@ -512,9 +511,14 @@ def with_args_appended(self, args: list[str]) -> ModelDefinition:
             if model.service is None:
                 new_models.append(model)
                 continue
-            existing = model.service.start_command or []
+            existing = model.service.start_command
+            merged: str | list[str]
+            if isinstance(existing, str):
+                merged = f"{existing} {shlex.join(args)}"
+            else:
+                merged = (existing or []) + args
             new_service = model.service.model_copy(
-                update={"start_command": existing + args},
+                update={"start_command": merged},
             )
             new_models.append(model.model_copy(update={"service": new_service}))
         return self.model_copy(update={"models": new_models})
@@ -548,16 +552,11 @@ def to_resolved(self) -> ModelHealthCheck:
 
 class ModelServiceConfigDraft(BaseConfigModel):
     pre_start_actions: list[PreStartAction] | None = None
-    start_command: list[str] | None = None
+    start_command: str | list[str] | None = None
     shell: str | None = None
     port: int | None = None
     health_check: ModelHealthCheckDraft | None = None
 
-    @field_validator("start_command", mode="before")
-    @classmethod
-    def _coerce_start_command(cls, value: Any) -> Any:
-        return _normalize_start_command(value)
-
     def to_resolved(self) -> ModelServiceConfig:
         # Drop unset (None) scalars so the strict type's ``Field(default=...)``
         # declarations remain the single source of truth for default values;
@@ -583,9 +582,15 @@ def to_resolved(self) -> ModelConfig:
             # ``start_command`` are resolved here, at the same moment the
             # draft becomes a strict ``ModelConfig`` and ``model_path`` is
             # finalized. Placeholders therefore never propagate downstream.
-            service.start_command = [
-                token.replace("{model_path}", self.model_path) for token in service.start_command
-            ]
+            if isinstance(service.start_command, str):
+                service.start_command = service.start_command.replace(
+                    "{model_path}", self.model_path
+                )
+            else:
+                service.start_command = [
+                    token.replace("{model_path}", self.model_path)
+                    for token in service.start_command
+                ]
         payload = self.model_dump(exclude_none=True, exclude={"service"})
         payload["service"] = service
         return ModelConfig.model_validate(payload)
diff --git a/src/ai/backend/common/dto/manager/v2/deployment/request.py b/src/ai/backend/common/dto/manager/v2/deployment/request.py
@@ -148,7 +148,7 @@ class ModelMetadataInput(BaseRequestModel):
 
 class ModelServiceConfigInput(BaseRequestModel):
     pre_start_actions: list[PreStartAction] | None = None
-    start_command: list[str] | None = None
+    start_command: str | list[str] | None = None
     shell: str | None = None
     port: int | None = None
     health_check: ModelHealthCheckInput | None = None
diff --git a/src/ai/backend/common/dto/manager/v2/deployment/types.py b/src/ai/backend/common/dto/manager/v2/deployment/types.py
@@ -299,8 +299,14 @@ class ModelServiceConfigInfoDTO(BaseResponseModel):
         default_factory=list,
         description="List of pre-start actions to execute before starting the model service.",
     )
-    start_command: list[str] | None = Field(
-        default=None, description="Command to start the model service."
+    start_command: str | list[str] | None = Field(
+        default=None,
+        description=(
+            "Command to start the model service. A list is exec'ed directly "
+            "as argv; a string is wrapped as ``[shell, '-c', str]`` by the "
+            "kernel runner so shell semantics (line continuations, ``$VAR`` "
+            "expansion, pipes) apply."
+        ),
     )
     shell: str = Field(
         default="/bin/bash",
diff --git a/src/ai/backend/manager/api/gql/deployment/types/revision.py b/src/ai/backend/manager/api/gql/deployment/types/revision.py
@@ -383,8 +383,14 @@ class ModelServiceConfigGQL:
     pre_start_actions: list[PreStartActionGQL] = gql_field(
         description="List of pre-start actions to execute before starting the model service."
     )
-    start_command: list[str] | None = gql_field(
-        description="Command to start the model service.", default=None
+    start_command: JSON | None = gql_field(
+        description=(
+            "Command to start the model service. A JSON array (``list[str]``) "
+            "is exec'ed directly as argv; a JSON string is wrapped as "
+            "``[shell, '-c', str]`` by the kernel runner so shell semantics "
+            "(line continuations, ``$VAR`` expansion, pipes) apply."
+        ),
+        default=None,
     )
     shell: str = gql_field(description="Shell configured for the model service.")
     port: int = gql_field(description="Port number for the model service.")
@@ -891,8 +897,14 @@ class ModelServiceConfigInputGQL(PydanticInputMixin[ModelServiceConfigInputDTO])
         description="List of pre-start actions to execute before starting the model service.",
         default=None,
     )
-    start_command: list[str] | None = gql_field(
-        description="Command to start the model service.", default=None
+    start_command: JSON | None = gql_field(
+        description=(
+            "Command to start the model service. A JSON array (``list[str]``) "
+            "is exec'ed directly as argv; a JSON string is wrapped as "
+            "``[shell, '-c', str]`` by the kernel runner so shell semantics "
+            "(line continuations, ``$VAR`` expansion, pipes) apply."
+        ),
+        default=None,
     )
     shell: str | None = gql_field(
         description="Shell configured for the model service.", default=None
diff --git a/tests/unit/common/test_config.py b/tests/unit/common/test_config.py
@@ -304,6 +304,45 @@ def test_to_resolved_leaves_start_command_with_no_placeholder_unchanged(self) ->
         assert service is not None
         assert service.start_command == ["my-server", "--bind", "0.0.0.0"]
 
+    def test_to_resolved_preserves_str_start_command(self) -> None:
+        # When ``start_command`` is given as a single shell-script string,
+        # the validator must NOT split it into argv tokens; the kernel
+        # runner is responsible for wrapping it via ``[shell, "-c", str]``.
+        # This protects copy-pasted multi-line vendor recipes (e.g. vLLM)
+        # where backslash-newline must be interpreted by the shell, not
+        # by ``shlex.split``.
+        script = "vllm serve {model_path} \\\n  --tensor-parallel-size 2"
+        draft = ModelDefinitionDraft.model_validate({
+            "models": [
+                {
+                    "name": "demo",
+                    "model_path": "/data",
+                    "service": {
+                        "start_command": script,
+                        "port": 8000,
+                    },
+                }
+            ]
+        })
+
+        resolved = draft.to_resolved()
+
+        service = resolved.models[0].service
+        assert service is not None
+        assert isinstance(service.start_command, str)
+        assert service.start_command == ("vllm serve /data \\\n  --tensor-parallel-size 2")
+
+    def test_model_service_config_accepts_str_start_command(self) -> None:
+        # Strict ``ModelServiceConfig`` (post-merge) accepts the str form
+        # too — required so the resolved revision can carry it all the
+        # way to the agent and kernel runner unchanged.
+        service = ModelServiceConfig.model_validate({
+            "start_command": "python -m foo --bar",
+            "port": 8000,
+        })
+
+        assert service.start_command == "python -m foo --bar"
+
 
 class TestModelDefinitionWithArgsAppended:
     @pytest.fixture
@@ -438,3 +477,27 @@ async def test_each_model_receives_args(
         assert first is not None and second is not None
         assert first.start_command == ["a", "--shared", "true"]
         assert second.start_command == ["b", "--shared", "true"]
+
+    async def test_appends_args_to_str_start_command_with_shell_quoting(self) -> None:
+        # Preset ARGS are appended to a string ``start_command`` after a
+        # single space, with each argument passed through ``shlex.join``
+        # so values containing whitespace or quotes are safely parsed by
+        # the same shell that will run the script.
+        definition = ModelDefinition(
+            models=[
+                ModelConfig(
+                    name="demo",
+                    model_path="/models",
+                    service=ModelServiceConfig(
+                        port=8000,
+                        start_command="vllm serve /models",
+                    ),
+                )
+            ]
+        )
+
+        result = definition.with_args_appended(["--prompt", "hello world"])
+
+        service = result.models[0].service
+        assert service is not None
+        assert service.start_command == "vllm serve /models --prompt 'hello world'"

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+Preserve the original `str` form of `start_command` in model definitions all the way to the kernel runner so the kernel runner's existing `[shell, "-c", str]` wrapping handles shell semantics (line continuations, `$VAR` expansion, pipes) — eliminating the need to manually strip backslashes from copy-pasted multi-line vendor recipes (e.g. vLLM).