From 9e3f5c3c24c7a3a37e0121a8b8ca1e2d5707b9a4 Mon Sep 17 00:00:00 2001 From: BoKeum Date: Mon, 18 May 2026 12:56:16 +0900 Subject: [PATCH 1/3] fix(BA-6062): wrap legacy string start_command via shell -c instead of shlex.split Co-Authored-By: Claude Opus 4.7 --- src/ai/backend/common/config.py | 89 ++++++++++--------- ...rewrap_legacy_string_start_command_via_.py | 74 +++++++++++++++ ...t_model_definition_start_command_compat.py | 83 +++++++++++------ 3 files changed, 176 insertions(+), 70 deletions(-) create mode 100644 src/ai/backend/manager/models/alembic/versions/70ffcaaa5f5c_rewrap_legacy_string_start_command_via_.py diff --git a/src/ai/backend/common/config.py b/src/ai/backend/common/config.py index 95b4e2dbae9..80adc4ac65e 100644 --- a/src/ai/backend/common/config.py +++ b/src/ai/backend/common/config.py @@ -1,7 +1,6 @@ from __future__ import annotations import os -import shlex import sys from collections.abc import Mapping, MutableMapping from pathlib import Path @@ -14,7 +13,7 @@ AliasChoices, ConfigDict, Field, - field_validator, + model_validator, ) from . import validators as tx @@ -144,14 +143,18 @@ def snake_to_kebab_case(string: str) -> str: agent_selector_config_iv = t.Dict({}) | agent_selector_globalconfig_iv -def _normalize_start_command(value: Any) -> Any: - """Coerce legacy ``str`` ``start_command`` into argv list via - :func:`shlex.split`. Lists, ``None``, and other types pass through so - the schema's strict check rejects them. - """ - if isinstance(value, str): - return shlex.split(value) - return value +DEFAULT_SHELL = "/bin/bash" + + +def _wrap_str_start_command_into_argv(service: Any) -> Any: + if not isinstance(service, dict): + return service + sc = service.get("start_command") + if not isinstance(sc, str): + return service + shell = service.get("shell") or DEFAULT_SHELL + # key override with the wrapped value, preserving other keys if present. + return {**service, "start_command": [shell, "-c", sc]} model_definition_iv = t.Dict({ @@ -159,31 +162,33 @@ def _normalize_start_command(value: Any) -> Any: t.Dict({ t.Key("name"): t.String, t.Key("model_path"): t.String, - t.Key("service", default=None): t.Null - | t.Dict({ - # ai.backend.kernel.service.ServiceParser.start_service() - # ai.backend.kernel.service_actions - t.Key("pre_start_actions", default=[]): t.Null - | t.List( - t.Dict({ - t.Key("action"): t.String, - t.Key("args"): t.Dict().allow_extra("*"), - }) - ), - t.Key("start_command", default=None): (t.Null | t.String | t.List(t.String)) - >> _normalize_start_command, - t.Key("shell", default="/bin/bash"): t.String, - t.Key("port"): t.ToInt[1:], - t.Key("health_check", default=None): t.Null + t.Key("service", default=None): ( + t.Null | t.Dict({ - t.Key("interval", default=10): t.Null | t.ToFloat[0:], - t.Key("path"): t.String, - t.Key("max_retries", default=10): t.Null | t.ToInt[1:], - t.Key("max_wait_time", default=15): t.Null | t.ToFloat[0:], - t.Key("expected_status_code", default=200): t.Null | t.ToInt[100:], - t.Key("initial_delay", default=60): t.Null | t.ToFloat[0:], - }), - }), + # ai.backend.kernel.service.ServiceParser.start_service() + # ai.backend.kernel.service_actions + t.Key("pre_start_actions", default=[]): t.Null + | t.List( + t.Dict({ + t.Key("action"): t.String, + t.Key("args"): t.Dict().allow_extra("*"), + }) + ), + t.Key("start_command", default=None): t.Null | t.String | t.List(t.String), + t.Key("shell", default=DEFAULT_SHELL): t.String, + t.Key("port"): t.ToInt[1:], + t.Key("health_check", default=None): t.Null + | t.Dict({ + t.Key("interval", default=10): t.Null | t.ToFloat[0:], + t.Key("path"): t.String, + t.Key("max_retries", default=10): t.Null | t.ToInt[1:], + t.Key("max_wait_time", default=15): t.Null | t.ToFloat[0:], + t.Key("expected_status_code", default=200): t.Null | t.ToInt[100:], + t.Key("initial_delay", default=60): t.Null | t.ToFloat[0:], + }), + }) + ) + >> _wrap_str_start_command_into_argv, t.Key("metadata", default=None): t.Null | t.Dict({ t.Key("author", default=None): t.Null | t.String(allow_blank=True), @@ -268,9 +273,9 @@ class ModelServiceConfig(BaseConfigModel): examples=[["python", "service.py"], ["vllm", "serve", "{model_path}"]], ) shell: str = Field( - default="/bin/bash", + default=DEFAULT_SHELL, description="Shell configured for the model service.", - examples=["/bin/bash"], + examples=[DEFAULT_SHELL], ) port: int = Field( description="Port number for the model service. Must be greater than 1.", @@ -282,10 +287,10 @@ class ModelServiceConfig(BaseConfigModel): description="Health check configuration for the model service.", ) - @field_validator("start_command", mode="before") + @model_validator(mode="before") @classmethod - def _coerce_start_command(cls, value: Any) -> Any: - return _normalize_start_command(value) + def _wrap_str_start_command(cls, data: Any) -> Any: + return _wrap_str_start_command_into_argv(data) class ModelMetadata(BaseConfigModel): @@ -553,10 +558,10 @@ class ModelServiceConfigDraft(BaseConfigModel): port: int | None = None health_check: ModelHealthCheckDraft | None = None - @field_validator("start_command", mode="before") + @model_validator(mode="before") @classmethod - def _coerce_start_command(cls, value: Any) -> Any: - return _normalize_start_command(value) + def _wrap_str_start_command(cls, data: Any) -> Any: + return _wrap_str_start_command_into_argv(data) def to_resolved(self) -> ModelServiceConfig: # Drop unset (None) scalars so the strict type's ``Field(default=...)`` diff --git a/src/ai/backend/manager/models/alembic/versions/70ffcaaa5f5c_rewrap_legacy_string_start_command_via_.py b/src/ai/backend/manager/models/alembic/versions/70ffcaaa5f5c_rewrap_legacy_string_start_command_via_.py new file mode 100644 index 00000000000..8b8b5cd52cb --- /dev/null +++ b/src/ai/backend/manager/models/alembic/versions/70ffcaaa5f5c_rewrap_legacy_string_start_command_via_.py @@ -0,0 +1,74 @@ +"""rewrap legacy string start_command via shell -c + +Migration ``8c1f7d3a9e2b`` (Part of: 26.5.0) wrapped any legacy ``str`` +``start_command`` as a one-item argv list (e.g. ``"python service.py"`` +became ``["python service.py"]``). That value is meaningless at exec +time -- ``execve`` would look for a binary literally named +``"python service.py"`` -- and it loses the shell semantics the user +originally intended. + +This migration repairs those rows by replacing single-element argv +lists whose only token still looks like a shell command (contains +whitespace) with ``[shell, "-c", token]``, where ``shell`` comes from +the sibling ``service.shell`` field and falls back to ``/bin/bash`` to +match ``ai.backend.common.config.DEFAULT_SHELL``. Multi-token argv +lists and single-token lists without whitespace are left untouched, +so the migration is safe to re-apply. + +Revision ID: 70ffcaaa5f5c +Revises: ba42cb865efe +Create Date: 2026-05-18 + +""" + +import json + +import sqlalchemy as sa +from alembic import op +from sqlalchemy.engine import Connection + +# revision identifiers, used by Alembic. +revision = "70ffcaaa5f5c" +down_revision = "ba42cb865efe" +# Part of: 26.5.1 +branch_labels = None +depends_on = None + +DEFAULT_SHELL = "/bin/bash" + + +def _rewrap_model_definition(conn: Connection, table: str, column: str) -> None: + rows = conn.execute( + sa.text(f"SELECT id, {column} FROM {table} WHERE {column} IS NOT NULL") + ).fetchall() + + for row_id, model_definition in rows: + changed = False + for model in model_definition.get("models") or []: + service = model.get("service") or {} + start_command = service.get("start_command") + if isinstance(start_command, list) and len(start_command) == 1 and " " in start_command[0]: + # Case when the start_command is a single string that looks like a shell command. + # e.g. ["python service.py"] -> ["/bin/bash", "-c", "python service.py"] + shell = service.get("shell") or DEFAULT_SHELL + service["start_command"] = [shell, "-c", start_command[0]] + changed = True + + if changed: + conn.execute( + sa.text(f"UPDATE {table} SET {column} = CAST(:definition AS JSONB) WHERE id = :id"), + {"definition": json.dumps(model_definition), "id": row_id}, + ) + + +def upgrade() -> None: + conn = op.get_bind() + _rewrap_model_definition(conn, "deployment_revisions", "model_definition") + _rewrap_model_definition(conn, "deployment_revision_presets", "model_definition") + _rewrap_model_definition(conn, "runtime_variants", "default_model_definition") + + +def downgrade() -> None: + # No-op: the ``[shell, "-c", token]`` form is also valid argv under + # any prior schema. + pass diff --git a/tests/unit/manager/data/deployment/test_model_definition_start_command_compat.py b/tests/unit/manager/data/deployment/test_model_definition_start_command_compat.py index 8afbb479c54..450296d2016 100644 --- a/tests/unit/manager/data/deployment/test_model_definition_start_command_compat.py +++ b/tests/unit/manager/data/deployment/test_model_definition_start_command_compat.py @@ -2,9 +2,11 @@ ``model_definition`` validators. PR #11402 narrowed ``ModelServiceConfig.start_command`` from -``str | list[str]`` to ``list[str] | None``. To preserve backward -compatibility for users who still author the field as a shell string, -the validators normalize ``str`` input via :func:`shlex.split`. +``str | list[str]`` to ``list[str] | None``. Legacy ``str`` inputs are +preserved by wrapping them as ``[shell, "-c", start_command]`` so the +original shell semantics (pipes, redirections, env expansion, +multiline) survive. ``shell`` falls back to ``/bin/bash`` when the +input omits it. """ from __future__ import annotations @@ -26,27 +28,38 @@ START_COMMAND_CASES = [ pytest.param( "python service.py", - ["python", "service.py"], - id="simple-split", + None, + ["/bin/bash", "-c", "python service.py"], + id="simple-default-shell", ), pytest.param( 'python -c "import x; x.run()"', - ["python", "-c", "import x; x.run()"], - id="shlex-preserves-quoted", + None, + ["/bin/bash", "-c", 'python -c "import x; x.run()"'], + id="quoted-default-shell", + ), + pytest.param( + "echo $HOME && exec python serve.py", + "/bin/zsh", + ["/bin/zsh", "-c", "echo $HOME && exec python serve.py"], + id="shell-semantics-custom-shell", ), ] -def _wrap_definition(start_command: str) -> dict[str, Any]: +def _wrap_definition(start_command: str, shell: str | None = None) -> dict[str, Any]: + service: dict[str, Any] = { + "start_command": start_command, + "port": 8080, + } + if shell is not None: + service["shell"] = shell return { "models": [ { "name": "model", "model_path": "/models/x", - "service": { - "start_command": start_command, - "port": 8080, - }, + "service": service, } ] } @@ -55,21 +68,21 @@ def _wrap_definition(start_command: str) -> dict[str, Any]: class TestPydanticInputCompat: """REST/GQL input → ``ModelServiceConfigDraft`` Pydantic validator.""" - @pytest.mark.parametrize(("raw", "expected"), START_COMMAND_CASES) - def test_str_input_is_normalized(self, raw: str, expected: list[str]) -> None: - draft = ModelServiceConfigDraft.model_validate({ - "start_command": raw, - "port": 8080, - }) + @pytest.mark.parametrize(("raw", "shell", "expected"), START_COMMAND_CASES) + def test_str_input_is_wrapped(self, raw: str, shell: str | None, expected: list[str]) -> None: + payload: dict[str, Any] = {"start_command": raw, "port": 8080} + if shell is not None: + payload["shell"] = shell + draft = ModelServiceConfigDraft.model_validate(payload) assert draft.start_command == expected class TestTrafaretInputCompat: """vfolder YAML scan → ``model_definition_iv`` trafaret validator.""" - @pytest.mark.parametrize(("raw", "expected"), START_COMMAND_CASES) - def test_str_input_is_normalized(self, raw: str, expected: list[str]) -> None: - result = model_definition_iv.check(_wrap_definition(raw)) + @pytest.mark.parametrize(("raw", "shell", "expected"), START_COMMAND_CASES) + def test_str_input_is_wrapped(self, raw: str, shell: str | None, expected: list[str]) -> None: + result = model_definition_iv.check(_wrap_definition(raw, shell)) assert result["models"][0]["service"]["start_command"] == expected @@ -77,8 +90,9 @@ class TestYAMLInputCompat: """End-to-end vfolder scan path: a user-authored ``model-definition.yaml`` is parsed by ruamel.yaml and fed to ``model_definition_iv``. Confirms that every YAML notation a user might write — legacy shell string, inline flow - sequence, hyphenated block sequence — resolves to the same canonical - ``list[str]``. + sequence, hyphenated block sequence — resolves to the expected + ``list[str]``. Legacy shell strings are wrapped via ``shell -c``; explicit + argv lists pass through unchanged. """ @pytest.mark.parametrize( @@ -93,8 +107,21 @@ class TestYAMLInputCompat: start_command: python service.py port: 8080 """), - ["python", "service.py"], - id="legacy-shell-string", + ["/bin/bash", "-c", "python service.py"], + id="legacy-shell-string-default-shell", + ), + pytest.param( + textwrap.dedent("""\ + models: + - name: model + model_path: /models/x + service: + start_command: echo $HOME | tee /tmp/out + shell: /bin/zsh + port: 8080 + """), + ["/bin/zsh", "-c", "echo $HOME | tee /tmp/out"], + id="legacy-shell-string-custom-shell", ), pytest.param( textwrap.dedent("""\ @@ -138,10 +165,10 @@ class TestPydanticColumnReadCompat: read → to_data flow. """ - @pytest.mark.parametrize(("raw", "expected"), START_COMMAND_CASES) - def test_legacy_str_is_normalized(self, raw: str, expected: list[str]) -> None: + @pytest.mark.parametrize(("raw", "shell", "expected"), START_COMMAND_CASES) + def test_legacy_str_is_wrapped(self, raw: str, shell: str | None, expected: list[str]) -> None: column = PydanticColumn(ModelDefinition) - resolved = column.process_result_value(_wrap_definition(raw), DefaultDialect()) + resolved = column.process_result_value(_wrap_definition(raw, shell), DefaultDialect()) assert resolved is not None assert resolved.models[0].service is not None assert resolved.models[0].service.start_command == expected From 455ccf7fe55c58fdf5244e7d6fa7b32fe491fdae Mon Sep 17 00:00:00 2001 From: BoKeum Date: Mon, 18 May 2026 12:56:48 +0900 Subject: [PATCH 2/3] style(BA-6062): apply ruff format to new migration file Co-Authored-By: Claude Opus 4.7 --- .../70ffcaaa5f5c_rewrap_legacy_string_start_command_via_.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/ai/backend/manager/models/alembic/versions/70ffcaaa5f5c_rewrap_legacy_string_start_command_via_.py b/src/ai/backend/manager/models/alembic/versions/70ffcaaa5f5c_rewrap_legacy_string_start_command_via_.py index 8b8b5cd52cb..9feea30bd50 100644 --- a/src/ai/backend/manager/models/alembic/versions/70ffcaaa5f5c_rewrap_legacy_string_start_command_via_.py +++ b/src/ai/backend/manager/models/alembic/versions/70ffcaaa5f5c_rewrap_legacy_string_start_command_via_.py @@ -47,7 +47,11 @@ def _rewrap_model_definition(conn: Connection, table: str, column: str) -> None: for model in model_definition.get("models") or []: service = model.get("service") or {} start_command = service.get("start_command") - if isinstance(start_command, list) and len(start_command) == 1 and " " in start_command[0]: + if ( + isinstance(start_command, list) + and len(start_command) == 1 + and " " in start_command[0] + ): # Case when the start_command is a single string that looks like a shell command. # e.g. ["python service.py"] -> ["/bin/bash", "-c", "python service.py"] shell = service.get("shell") or DEFAULT_SHELL From 25fbe3f62819db4b774ab1e8cbc004a1551521db Mon Sep 17 00:00:00 2001 From: BoKeum Date: Mon, 18 May 2026 13:12:34 +0900 Subject: [PATCH 3/3] changelog: add news fragment for PR #11648 --- changes/11648.fix.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changes/11648.fix.md diff --git a/changes/11648.fix.md b/changes/11648.fix.md new file mode 100644 index 00000000000..47236f09299 --- /dev/null +++ b/changes/11648.fix.md @@ -0,0 +1 @@ +Wrap legacy string `start_command` in model definitions as `[shell, '-c', value]` to preserve shell semantics, with a data migration that repairs previously-broken single-token argv rows