[owl] Bug fixes and updates (#849)

jiahuei · jiahuei · commit df0f10e54f4e · 2025-12-17T06:42:28.000Z
Backend - owl (API server)

- Bug fixes:
  - Always update reasoning content in state
  - Only record reasoning time if there's reasoning content
  - Handle Anthropic 4.1 and 4.5 models, which do not accept both `temperature` and `top_p` in the same request
- Delete an organization's secrets when the organization is deleted
- Bump dependencies
- Deps: Pin Pydantic to `2.11.x` (2.12 causes issues with sqlmodel Datetime)
- Test-LLM: Support reasoning content
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -247,6 +247,11 @@ jobs:
         timeout-minutes: 1
         run: mkdir -p logs && docker compose -p jm -f docker/compose.ci.yml logs starling > logs/starling.log
 
+      - name: Inspect test-llm logs
+        timeout-minutes: 1
+        if: always() && steps.launch_services.outcome == 'success'
+        run: mkdir -p logs && docker compose -p jm -f docker/compose.ci.yml logs test-llm > logs/test-llm.log
+
       - name: Test Stripe integration (Cloud only)
         id: test_stripe
         if: matrix.jamai-mode == 'cloud' && matrix.test-group == 'group1' && steps.launch_services.outcome == 'success'
diff --git a/clients/python/pyproject.toml b/clients/python/pyproject.toml
@@ -117,7 +117,7 @@ dependencies = [
     "Pillow>=10.0",
     "pydantic-extra-types~=2.9",
     "pydantic-settings~=2.4",
-    "pydantic[email,timezone]~=2.10",
+    "pydantic[email,timezone]~=2.11",
     "pyyaml~=6.0",
     "toml~=0.10.2",
     "typing_extensions~=4.10",
diff --git a/services/api/pyproject.toml b/services/api/pyproject.toml
@@ -111,15 +111,15 @@ classifiers = [ # https://pypi.org/classifiers/
 # Sort your dependencies https://sortmylist.com/
 # In general, for v1 and above, we pin to minor version using ~=
 dependencies = [
-    "aioboto3~=7.0.0",
-    "aiobotocore~=2.21.0",                                  # Took long time to resolve
-    "aiofiles~=24.1.0",
+    "aioboto3~=15.5.0",
+    "aiobotocore==2.25.1",                                  # Took long time to resolve
+    "aiofiles~=25.1.0",
     "aiosqlite~=0.21.0",
     "async-lru~=2.0.0",
     "asyncpg~=0.30.0",
     "authlib~=1.6.0",
     "bm25s~=0.2.0",
-    "boto3==1.37.1",                                        # Took long time to resolve
+    "boto3==1.40.61",                                       # Took long time to resolve
     "celery~=5.5.0",
     "clickhouse-connect~=0.8.0",
     "cloudevents~=1.12.0",
@@ -142,7 +142,7 @@ dependencies = [
     "natsort[fast]~=8.4.0",
     "nltk~=3.9.0",
     "numpy>=1.26.0",
-    "openai~=1.99.0",
+    "openai~=2.9.0",
     "opentelemetry-api~=1.36.0",
     "opentelemetry-distro~=0.57b0",
     "opentelemetry-exporter-otlp~=1.36.0",
@@ -187,14 +187,14 @@ dependencies = [
     "pycryptodomex~=3.23.0",
     "pydantic-extra-types~=2.10.0",
     "pydantic-settings~=2.10.0",
-    "pydantic[email,timezone]~=2.11.0",
+    "pydantic[email,timezone]~=2.11.0",                     # 2.12 causes issues with sqlmodel Datetime
     "pydub~=0.25.0",
     "pyjwt~=2.10.0",
     "pylance==0.16.0",
     "python-multipart~=0.0.20",
     "redis[hiredis]~=5.3.0",
     "SQLAlchemy~=2.0.0",
-    "sqlmodel~=0.0.20",
+    "sqlmodel~=0.0.27",
     "sqlparse~=0.5.0",
     "starlette~=0.41.0",
     "stripe~=9.12.0",
diff --git a/services/api/src/owl/db/gen_executor.py b/services/api/src/owl/db/gen_executor.py
@@ -748,7 +748,7 @@ async def _execute_chat_task(self, task: Task, q: Queue[ResultT | None]) -> None
                 ):
                     reasoning += chunk.reasoning_content
                     result += chunk.content
-                    if chunk.content and reasoning_time is None:
+                    if chunk.content and reasoning and reasoning_time is None:
                         reasoning_time = perf_counter() - t0
                     # if chunk.content is None and chunk.usage is None:
                     #     continue
@@ -822,12 +822,10 @@ async def _execute_chat_task(self, task: Task, q: Queue[ResultT | None]) -> None
             await q.put(None)
             state_col = f"{task.output_column_name}_"
             state = self._column_dict.get(state_col, {})
-            if references is not None:
-                state["references"] = references.model_dump(mode="json")
-            if reasoning:
-                state["reasoning_content"] = reasoning
-            if reasoning_time is not None:
-                state["reasoning_time"] = reasoning_time
+            # Always update state
+            state["references"] = references.model_dump(mode="json") if references else None
+            state["reasoning_content"] = reasoning if reasoning else None
+            state["reasoning_time"] = reasoning_time if reasoning_time else None
             self._column_dict[state_col] = state
             await self._signal_task_completion(task, result)
             self.log(f'Streamed completion for column "{output_column}": <{mask_string(result)}>.')
diff --git a/services/api/src/owl/entrypoints/llm.py b/services/api/src/owl/entrypoints/llm.py
@@ -234,6 +234,7 @@ async def chat_completion(body: ChatCompletionRequest):
     model_spec = _parse_chat_model_id(body.model)
     num_input_tokens = len(" ".join(m.text_content for m in body.messages).split(" "))
     user_messages = [m for m in body.messages if m.role == ChatRole.USER]
+    reasoning = body.reasoning_effort not in (None, "disable", "minimal", "none")
 
     # Test context length error handling
     if num_input_tokens > model_spec.max_context_length:
@@ -348,15 +349,16 @@ async def stream_response():
                 for i in range(body.n):
                     if model_spec.tpot_ms > 0:
                         await sleep(model_spec.tpot_ms / 1000)
+                    if reasoning and t < 2:
+                        delta = ChatCompletionDelta(reasoning_content=content)
+                    else:
+                        delta = ChatCompletionDelta(content=content)
                     chunk = ChatCompletionChunkResponse(
                         id=body.id,
                         model=model_spec.id,
                         choices=[
                             ChatCompletionChoice(
-                                index=i,
-                                delta=ChatCompletionDelta(content=content),
-                                logprobs=None,
-                                finish_reason=None,
+                                index=i, delta=delta, logprobs=None, finish_reason=None
                             )
                         ],
                         usage=None,
@@ -417,20 +419,25 @@ async def stream_response():
     # Non-stream
     if (model_spec.ttft_ms + model_spec.tpot_ms) > 0:
         await sleep((model_spec.ttft_ms + model_spec.tpot_ms * len(completion_tokens)) / 1000)
+    contents = []
+    reasoning_contents = []
+    for t in range(num_completion_tokens):
+        if reasoning and t < 2:
+            reasoning_contents.append(completion_tokens[t % len(completion_tokens)])
+        else:
+            contents.append(completion_tokens[t % len(completion_tokens)])
     response = ChatCompletionResponse(
         id=body.id,
         model=model_spec.id,
         choices=[
             ChatCompletionChoice(
                 index=i,
                 message=ChatCompletionMessage(
-                    content=" ".join(
-                        completion_tokens[t % len(completion_tokens)]
-                        for t in range(num_completion_tokens)
-                    )
+                    content=" ".join(contents),
+                    reasoning_content=" ".join(reasoning_contents) if reasoning_contents else None,
                 ),
                 logprobs=None,
-                finish_reason="length",
+                finish_reason="length" if num_completion_tokens == body.max_tokens else "stop",
             )
             for i in range(body.n)
         ],
diff --git a/services/api/src/owl/routers/organizations/__init__.py b/services/api/src/owl/routers/organizations/__init__.py
@@ -8,6 +8,7 @@
 router.include_router(oss_router)
 router.include_router(secrets_router)
 
+
 if ENV_CONFIG.is_cloud:
     from owl.routers.organizations.cloud import router as cloud_router
 
diff --git a/services/api/src/owl/routers/organizations/oss.py b/services/api/src/owl/routers/organizations/oss.py
@@ -17,6 +17,7 @@
     PricePlan,
     Project,
     ProjectMember,
+    Secret,
     User,
 )
 from owl.types import (
@@ -316,6 +317,7 @@ async def delete_organization(
     # Delete related resources
     await session.exec(delete(Organization).where(Organization.id == organization_id))
     await session.exec(delete(Project).where(Project.organization_id == organization_id))
+    await session.exec(delete(Secret).where(Secret.organization_id == organization_id))
     if ENV_CONFIG.is_cloud:
         from owl.db.models.cloud import VerificationCode
 
diff --git a/services/api/src/owl/routers/organizations/secrets.py b/services/api/src/owl/routers/organizations/secrets.py
@@ -24,11 +24,17 @@
     ResourceNotFoundError,
     handle_exception,
 )
+from owl.utils.mcp import MCP_TOOL_TAG
 
 router = APIRouter()
 
 
-@router.post("/v2/secrets")
+@router.post(
+    "/v2/secrets",
+    summary="Create an organization secret.",
+    description="Permissions: `organization.ADMIN`.",
+    tags=[MCP_TOOL_TAG, "organization.ADMIN"],
+)
 @handle_exception
 async def create_secret(
     request: Request,
@@ -80,9 +86,7 @@ async def create_secret(
         existing_projects = (await session.exec(statement)).all()
         if len(existing_projects) != len(body.allowed_projects):
             non_exist_projects = set(body.allowed_projects) - set(existing_projects)
-            raise BadInputError(
-                f"Non-existing projects are not allowed: '{"', '".join(non_exist_projects)}'."
-            )
+            raise ResourceNotFoundError(f"Projects not found: {', '.join(non_exist_projects)}")
 
     # Create new secret
     secret = Secret(
@@ -103,8 +107,9 @@ async def create_secret(
 
 @router.get(
     "/v2/secrets/list",
-    summary="List system-wide secrets.",
+    summary="List organization secrets.",
     description="Permissions: `organization.MEMBER`.",
+    tags=[MCP_TOOL_TAG, "organization.MEMBER"],
 )
 @handle_exception
 async def list_secrets(
@@ -158,8 +163,9 @@ async def list_secrets(
 
 @router.get(
     "/v2/secrets",
-    summary="Get a secret.",
+    summary="Get an organization secret.",
     description="Permissions: `organization.MEMBER`.",
+    tags=[MCP_TOOL_TAG, "organization.MEMBER"],
 )
 @handle_exception
 async def get_secret(
@@ -195,11 +201,15 @@ async def get_secret(
     secret = await session.get(Secret, (organization_id, normalized_name))
     if secret is None:
         raise ResourceNotFoundError(f'Secret "{normalized_name}" is not found.')
-
     return secret.to_read_masked()
 
 
-@router.patch("/v2/secrets")
+@router.patch(
+    "/v2/secrets",
+    summary="Update an organization secret.",
+    description="Permissions: `organization.ADMIN`.",
+    tags=[MCP_TOOL_TAG, "organization.ADMIN"],
+)
 @handle_exception
 async def update_secret(
     request: Request,
@@ -256,9 +266,7 @@ async def update_secret(
         existing_projects = (await session.exec(statement)).all()
         if len(existing_projects) != len(body.allowed_projects):
             non_exist_projects = set(body.allowed_projects) - set(existing_projects)
-            raise BadInputError(
-                f"Non-existing projects are not allowed: '{"', '".join(non_exist_projects)}'."
-            )
+            raise ResourceNotFoundError(f"Projects not found: {', '.join(non_exist_projects)}")
     secret, updates = await Secret.update(
         session, (organization_id, normalized_name), body, name="Secret"
     )
@@ -272,8 +280,9 @@ async def update_secret(
 
 @router.delete(
     "/v2/secrets",
-    summary="Delete a secret.",
+    summary="Delete an organization secret.",
     description="Permissions: `organization.ADMIN`.",
+    tags=[MCP_TOOL_TAG, "organization.ADMIN"],
 )
 @handle_exception
 async def delete_secret(
diff --git a/services/api/src/owl/utils/lm.py b/services/api/src/owl/utils/lm.py
@@ -498,8 +498,8 @@ def _prepare_hyperparams(
 
         # Anthropic specific
         if ctx.inference_provider == CloudProvider.ANTHROPIC:
-            # Sonnet 4.5 cannot specify both `temperature` and `top_p`
-            if "sonnet-4-5" in ctx.routing_id:
+            # 4.1 and 4.5 models cannot specify both `temperature` and `top_p`
+            if "-4-1" in ctx.routing_id or "-4-5" in ctx.routing_id:
                 t = hyperparams.get("temperature", None)
                 p = hyperparams.get("top_p", None)
                 if t is not None and p is not None:
@@ -587,6 +587,7 @@ def _prepare_hyperparams(
                     reasoning_effort = "high"
             if ctx.inference_provider == CloudProvider.ELLM:
                 hyperparams["reasoning_effort"] = reasoning_effort
+                hyperparams["allowed_openai_params"] = ["reasoning_effort"]
                 return
             elif ctx.inference_provider in [CloudProvider.GEMINI, CloudProvider.ANTHROPIC]:
                 # Gemini 3-Pro recommends reasoning_effort
@@ -604,8 +605,10 @@ def _prepare_hyperparams(
                     else:
                         thinking_budget = 8192
                 if ctx.inference_provider == CloudProvider.ANTHROPIC:
-                    hyperparams["temperature"] = 1
-                    hyperparams["top_p"] = min(max(0.95, hyperparams.pop("top_p", 1.0)), 1.0)
+                    if "temperature" in hyperparams:
+                        hyperparams["temperature"] = 1
+                    if "top_p" in hyperparams:
+                        hyperparams["top_p"] = min(max(0.95, hyperparams.pop("top_p", 1.0)), 1.0)
                     thinking_budget = max(thinking_budget, 1024)
                 hyperparams["thinking"] = {
                     "type": "enabled",
diff --git a/services/api/src/owl/utils/test.py b/services/api/src/owl/utils/test.py
@@ -401,6 +401,7 @@ def setup_projects():
     type=ModelType.LLM,
     capabilities=[
         ModelCapability.CHAT,
+        ModelCapability.REASONING,
         ModelCapability.IMAGE,
         ModelCapability.AUDIO,
     ],
@@ -483,7 +484,7 @@ def setup_projects():
 ELLM_DESCRIBE_DEPLOYMENT = DeploymentCreate(
     model_id=ELLM_DESCRIBE_CONFIG.id,
     name=f"{ELLM_DESCRIBE_CONFIG.name} Deployment",
-    provider="custom",
+    provider=CloudProvider.ELLM,
     routing_id=ELLM_DESCRIBE_CONFIG.id,
     api_base=ENV_CONFIG.test_llm_api_base,
 )
diff --git a/services/api/tests/gen_table/test_row_ops_v2.py b/services/api/tests/gen_table/test_row_ops_v2.py
diff --git a/services/api/tests/routers/test_secrets.py b/services/api/tests/routers/test_secrets.py