kaito-project
diff --git a/‎.github/workflows/test.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/test.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎Makefile‎
Lines changed: 9 additions & 2 deletions b/‎Makefile‎
Lines changed: 9 additions & 2 deletions
diff --git a/‎README.md‎
Lines changed: 1 addition & 0 deletions b/‎README.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎agents.md‎
Lines changed: 5 additions & 2 deletions b/‎agents.md‎
Lines changed: 5 additions & 2 deletions
diff --git a/‎backend/scripts/embed-assets.ts‎
Lines changed: 1 addition & 0 deletions b/‎backend/scripts/embed-assets.ts‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎backend/src/hono-app.ts‎
Lines changed: 8 additions & 0 deletions b/‎backend/src/hono-app.ts‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎backend/src/routes/deployments.test.ts‎
Lines changed: 132 additions & 3 deletions b/‎backend/src/routes/deployments.test.ts‎
Lines changed: 132 additions & 3 deletions
diff --git a/‎backend/src/routes/deployments.ts‎
Lines changed: 14 additions & 0 deletions b/‎backend/src/routes/deployments.ts‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎backend/src/routes/index.ts‎
Lines changed: 1 addition & 0 deletions b/‎backend/src/routes/index.ts‎
Lines changed: 1 addition & 0 deletions
@@ -121,7 +121,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        provider: [dynamo, kaito, kuberay, llmd]
+        provider: [dynamo, kaito, kuberay, llmd, vllm]
 
     steps:
       - name: Checkout repository
 
@@ -197,6 +197,7 @@ providers-test: verify-versions
 	cd providers/kaito && go test ./...
 	cd providers/kuberay && go test ./...
 	cd providers/llmd && go test ./...
+	cd providers/vllm && go test ./...
 	@echo "✅ Provider tests completed"
 
 # Generate deploy manifests for controller and dashboard
@@ -279,6 +280,7 @@ cleanup-gateway:
 GAIE_VERSION_RE := $(subst .,\.,$(GAIE_VERSION))
 DYNAMO_VERSION_RE := $(subst .,\.,$(DYNAMO_VERSION))
 KAITO_VERSION_RE := $(subst .,\.,$(KAITO_VERSION))
+VLLM_VERSION_RE := $(subst .,\.,$(VLLM_VERSION))
 
 verify-versions:
 	@# 1. controller/go.mod must pin GAIE_VERSION
@@ -296,7 +298,10 @@ verify-versions:
 	@# 5. providers/kaito/config.go install Command --version arg must match KAITO_VERSION
 	@grep -qE -- '--version $(KAITO_VERSION_RE) ' providers/kaito/config.go || \
 	  { echo "❌ providers/kaito/config.go install Command --version != $(KAITO_VERSION) (from versions.env)"; exit 1; }
-	@# 6. generated TS must be in sync with versions.env.
+	@# 6. providers/vllm/transformer.go fallback literal must match VLLM_VERSION
+	@grep -qE '^var VLLMVersion = "$(VLLM_VERSION_RE)"$$' providers/vllm/transformer.go || \
+	  { echo "❌ providers/vllm/transformer.go VLLMVersion fallback != $(VLLM_VERSION) (from versions.env)"; exit 1; }
+	@# 7. generated TS must be in sync with versions.env.
 	@#    Generate to a temp file and diff against the working-tree copy so
 	@#    that synced uncommitted edits pass (the local-dev case) while
 	@#    stale committed files still fail (the CI case — CI's working
@@ -314,7 +319,9 @@ verify-versions:
 	    echo "❌ shared/types/versions.generated.ts is stale — run 'cd shared && bun run generate-versions' and commit the result"; \
 	    exit 1; \
 	  }
-	@echo "✅ versions in sync (GAIE_VERSION=$(GAIE_VERSION), DYNAMO_VERSION=$(DYNAMO_VERSION), KAITO_VERSION=$(KAITO_VERSION))"
+	@# Print the versions straight from versions.env so this summary stays in
+	@# sync automatically as keys are added (no hardcoded list to maintain).
+	@printf '✅ versions in sync (%s)\n' "$$(awk -F= '/^[A-Z][A-Z0-9_]*=/ { printf "%s%s=%s", sep, $$1, $$2; sep=", " }' versions.env)"
 
 # Test the verify-versions guard itself by deliberately breaking each
 # input it inspects and asserting the target exits non-zero.
 
@@ -31,6 +31,7 @@ AI Runway gives you a web UI and a unified Kubernetes CRD (`ModelDeployment`) to
 | [**KubeRay**](https://github.com/ray-project/kuberay)    | Ray-based distributed inference                                    | [kuberay.yaml](providers/kuberay/deploy/kuberay.yaml) |
 | [**KAITO**](https://github.com/kaito-project/kaito)      | vLLM (GPU) and llama.cpp (CPU/GPU) support                         | [kaito.yaml](providers/kaito/deploy/kaito.yaml)       |
 | [**LLM-D**](https://github.com/llm-d/llm-d)              | vLLM (GPU) with aggregated or disaggregated serving                | [llmd.yaml](providers/llmd/deploy/llmd.yaml)          |
+| [**Direct vLLM**](docs/providers/vllm.md)                 | Direct OpenAI-compatible vLLM Deployments for newest model support | [vllm.yaml](providers/vllm/deploy/vllm.yaml)          |
 
 ## Quick Start
 
 
@@ -2,7 +2,7 @@
 
 ## WHY: Project Purpose
 
-**AI Runway** is a platform for deploying and managing machine learning models on Kubernetes. It provides a unified CRD abstraction (`ModelDeployment`) that works across multiple inference providers (KAITO, Dynamo, KubeRay, llm-d, etc.).
+**AI Runway** is a platform for deploying and managing machine learning models on Kubernetes. It provides a unified CRD abstraction (`ModelDeployment`) that works across multiple inference providers (KAITO, Dynamo, KubeRay, llm-d, Direct vLLM, etc.).
 
 ## WHAT: Tech Stack & Structure
 
@@ -18,6 +18,7 @@
   - `controller/config/` - Kustomize manifests for CRDs/RBAC
 - `frontend/src/` - React components, hooks, pages
 - `backend/src/` - Hono app, providers, services
+- `providers/` - Standalone provider controllers/shims (`dynamo`, `kaito`, `kuberay`, `llmd`, `vllm`); each renders `ModelDeployment` into its upstream resource. `providers/vllm` is the in-repo Direct vLLM provider (renders native `Deployment`+`Service`, selected via `spec.provider.name: vllm`).
 - `shared/types/` - Shared TypeScript definitions
 - `plugins/headlamp/` - Headlamp dashboard plugin
 - `docs/` - Detailed documentation (read as needed; also the source rendered on the website)
@@ -103,7 +104,9 @@ Unified API for deploying ML models. Key fields:
 - `spec.model.id` - HuggingFace model ID or custom identifier
 - `spec.model.source` - `huggingface` or `custom`
 - `spec.engine.type` - `vllm`, `sglang`, `trtllm`, or `llamacpp` (optional, auto-selected from provider capabilities)
-- `spec.provider.name` - Optional explicit provider selection
+- `spec.engine.image` - Optional engine-specific container image override (preferred over legacy top-level `spec.image`; used by Direct vLLM/custom images)
+- `spec.engine.extraArgs` - Optional list of raw engine flags appended verbatim
+- `spec.provider.name` - Optional explicit provider selection (`kaito`, `dynamo`, `kuberay`, `llmd`, `vllm`)
 - `spec.serving.mode` - `aggregated` (default) or `disaggregated`
 - `spec.resources.gpu.count` - GPU count for aggregated mode
 - `spec.scaling.prefill/decode` - Component scaling for disaggregated mode
 
@@ -77,6 +77,7 @@ function collectFiles(dir: string, prefix: string = ''): AssetInfo[] {
 function generateModule(assets: AssetInfo[]): string {
   const lines: string[] = [
     '// AUTO-GENERATED FILE - DO NOT EDIT',
+    '// @ts-nocheck - Bun file imports are validated by the bundler, not tsc',
     '// Generated by scripts/embed-assets.ts',
     '// Run "bun run embed" to regenerate',
     '//',
 
@@ -1,6 +1,7 @@
 import { Hono } from 'hono';
 import { cors } from 'hono/cors';
 import { compress } from 'hono/compress';
+import { trimTrailingSlash } from 'hono/trailing-slash';
 import { HTTPException } from 'hono/http-exception';
 
 import { authService } from './services/auth';
@@ -30,6 +31,7 @@ import {
   aiconfigurator,
   costs,
   gateway,
+  vllmRecipes,
 } from './routes';
 
 // Load static files at startup
@@ -105,6 +107,11 @@ const app = new Hono<AppEnv>();
 
 // Global middleware
 app.use('*', compress());
+// Treat a trailing slash as equivalent to no slash: Hono routes strictly, so
+// "/api/vllm/recipes/" would otherwise 404 while "/api/vllm/recipes" works.
+// This only acts on a would-be 404 GET/HEAD, 301-redirecting to the no-slash
+// path, so it never changes the outcome of an already-matched route.
+app.use('*', trimTrailingSlash());
 app.use(
   '*',
   cors({
@@ -203,6 +210,7 @@ app.route('/api/aikit', aikit);
 app.route('/api/aiconfigurator', aiconfigurator);
 app.route('/api/costs', costs);
 app.route('/api/gateway', gateway);
+app.route('/api/vllm/recipes', vllmRecipes);
 
 // Static file serving middleware - uses Bun.file() for zero-copy serving
 app.use('*', async (c, next) => {
 
@@ -264,6 +264,90 @@ describe('Deployment Routes', () => {
       const data = await res.json();
       expect(data.resources[0].manifest.spec.gateway).toEqual({ enabled: false });
     });
+
+    test('preserves env in preview manifests', async () => { // preview endpoint must carry request env into the manifest
+      restores.push( // NOTE(review): presumably registers the mock's undo handle for later cleanup — confirm
+        mockServiceMethod(configService, 'getDefaultNamespace', async () => 'default'),
+      );
+
+      const env = { // request-side shape: plain name -> value map
+        VLLM_USE_V1: '1',
+        NCCL_DEBUG: 'INFO',
+      };
+
+      const res = await app.request('/api/deployments/preview', {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify({
+          ...validDeploymentBody,
+          provider: 'vllm',
+          env,
+        }),
+      });
+
+      expect(res.status).toBe(200);
+
+      const data = await res.json();
+      expect(data.resources[0].manifest.spec.env).toEqual([ // manifest-side shape: ordered list of { name, value } entries
+        { name: 'VLLM_USE_V1', value: '1' },
+        { name: 'NCCL_DEBUG', value: 'INFO' },
+      ]);
+    });
+
+    test('preserves Direct vLLM recipe provenance as metadata annotations in preview manifests', async () => { // provenance belongs in annotations, not in spec/status
+      restores.push( // NOTE(review): presumably registers the mock's undo handle for later cleanup — confirm
+        mockServiceMethod(configService, 'getDefaultNamespace', async () => 'default'),
+      );
+
+      const imageRef = 'vllm/vllm-openai@sha256:1111111111111111111111111111111111111111111111111111111111111111'; // digest-pinned image reference (placeholder digest)
+      const recipeFeatures = ['prefixCaching', 'kvCacheDtype'];
+
+      const res = await app.request('/api/deployments/preview', {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify({
+          ...validDeploymentBody,
+          name: 'recipe-vllm',
+          provider: 'vllm',
+          imageRef,
+          engineExtraArgs: ['--enable-auto-tool-choice'],
+          recipeProvenance: { // every field set here is asserted below as an annotation
+            source: 'vllm-recipes',
+            id: 'meta-llama/Llama-3.1-8B-Instruct',
+            strategy: 'single_node_tp',
+            hardware: 'h100',
+            variant: 'default',
+            precision: 'bf16',
+            features: recipeFeatures,
+            revision: '2026-05-04',
+          },
+        }),
+      });
+
+      expect(res.status).toBe(200);
+
+      const data = await res.json();
+      const manifest = data.resources[0].manifest;
+      expect(manifest.metadata.annotations).toEqual({ // toEqual: no annotations other than these are expected
+        'airunway.ai/generated-by': 'vllm-recipe-resolver',
+        'airunway.ai/recipe.source': 'vllm-recipes',
+        'airunway.ai/recipe.id': 'meta-llama/Llama-3.1-8B-Instruct',
+        'airunway.ai/recipe.strategy': 'single_node_tp',
+        'airunway.ai/recipe.hardware': 'h100',
+        'airunway.ai/recipe.variant': 'default',
+        'airunway.ai/recipe.precision': 'bf16',
+        'airunway.ai/recipe.revision': '2026-05-04',
+        'airunway.ai/recipe.features': JSON.stringify(recipeFeatures), // the features array is expected as a JSON-encoded string
+      });
+      expect(manifest.spec.provider.name).toBe('vllm');
+      expect(manifest.spec.engine.type).toBe('vllm');
+      expect(manifest.spec.engine.image).toBe(imageRef); // imageRef is expected under spec.engine.image ...
+      expect(manifest.spec.engine.extraArgs).toEqual(['--enable-auto-tool-choice']);
+      expect(manifest.spec.image).toBeUndefined(); // ... and not under the top-level spec.image
+      expect(manifest.spec.recipe).toBeUndefined(); // provenance must not appear under spec ...
+      expect(manifest.spec.recipes).toBeUndefined();
+      expect(manifest.status?.recipe).toBeUndefined(); // ... nor under status
+    });
   });
 
   describe('POST /api/deployments - storage validation', () => {
@@ -933,8 +1017,9 @@ describe('Deployment Routes', () => {
       });
 
       expect(res.status).toBe(201);
-      expect(capturedConfig.imageRef).toBe('ghcr.io/kaito-project/aikit/runners/llama-cpp-cuda:latest');
-      expect(capturedConfig.engineArgs?.ggufUrl).toBe(
+      expect(capturedConfig).toBeDefined();
+      expect(capturedConfig!.imageRef).toBe('ghcr.io/kaito-project/aikit/runners/llama-cpp-cuda:latest');
+      expect(capturedConfig!.engineArgs?.ggufUrl).toBe(
         'https://huggingface.co/unsloth/NVIDIA-Nemotron-3-Nano-4B-GGUF/resolve/main/NVIDIA-Nemotron-3-Nano-4B-Q4_K_M.gguf'
       );
     });
@@ -976,7 +1061,51 @@ describe('Deployment Routes', () => {
       });
 
       expect(res.status).toBe(201);
-      expect(capturedConfig.imageRef).toBe('ghcr.io/kaito-project/aikit/llama3.2:3b');
+      expect(capturedConfig).toBeDefined();
+      expect(capturedConfig!.imageRef).toBe('ghcr.io/kaito-project/aikit/llama3.2:3b');
+    });
+
+    test('passes env through create schema to Kubernetes service', async () => { // env must survive request validation and reach createDeployment
+      let capturedConfig: DeploymentConfig | undefined; // set by the createDeployment mock below
+
+      restores.push( // NOTE(review): presumably registers the mock's undo handle for later cleanup — confirm
+        mockServiceMethod(kubernetesService, 'createDeployment', async (config) => {
+          capturedConfig = config; // record what the route passed in instead of creating anything
+          return undefined;
+        }),
+      );
+      restores.push(
+        mockServiceMethod(kubernetesService, 'getClusterGpuCapacity', async () => ({ // stubbed GPU capacity report
+          totalGpus: 16,
+          allocatedGpus: 0,
+          availableGpus: 16,
+          maxContiguousAvailable: 8,
+          nodes: [],
+        })),
+      );
+      restores.push(
+        mockServiceMethod(configService, 'getDefaultNamespace', async () => 'default'),
+      );
+
+      const env = { // request-side shape: plain name -> value map
+        VLLM_USE_V1: '1',
+        NCCL_DEBUG: 'INFO',
+      };
+
+      const res = await app.request('/api/deployments', {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify({
+          ...validDeploymentBody,
+          name: 'env-test',
+          provider: 'vllm',
+          env,
+        }),
+      });
+
+      expect(res.status).toBe(201);
+      expect(capturedConfig).toBeDefined(); // the mock must have been invoked
+      expect(capturedConfig!.env).toEqual(env); // env is expected unchanged, still as a map
     });
 
     test('accepts deployment with providerOverrides', async () => {
 
@@ -132,6 +132,17 @@ const storageSchema = z.object({
   volumes: z.array(storageVolumeSchema).max(8, 'Maximum 8 storage volumes allowed').optional(),
 }).optional();
 
+const recipeProvenanceSchema = z.object({ // request-body schema for recipe provenance; every field is optional
+  source: z.string().optional(),
+  id: z.string().optional(),
+  strategy: z.string().optional(),
+  hardware: z.string().optional(),
+  variant: z.string().optional(),
+  precision: z.string().optional(),
+  features: z.array(z.string()).optional(), // the only non-scalar field: a list of strings
+  revision: z.string().optional(),
+}).optional(); // the object as a whole may be omitted from the request
+
 const createDeploymentSchema = z.object({
   name: resourceNameSchema,
   modelId: z.string().min(1, 'Model ID is required'),
@@ -152,6 +163,8 @@ const createDeploymentSchema = z.object({
     memory: z.string().optional(),
   }).optional(),
   engineArgs: z.record(z.string(), z.unknown()).optional(),
+  engineExtraArgs: z.array(z.string()).optional(),
+  env: z.record(z.string(), z.string()).optional(),
   providerOverrides: z.record(z.string(), z.unknown()).optional(),
   prefillReplicas: z.number().int().min(0).optional(),
   decodeReplicas: z.number().int().min(0).optional(),
@@ -165,6 +178,7 @@ const createDeploymentSchema = z.object({
   computeType: z.enum(['cpu', 'gpu']).optional(),
   maxModelLen: z.number().int().positive().optional(),
   gatewayEnabled: z.boolean().optional(),
+  recipeProvenance: recipeProvenanceSchema,
   storage: storageSchema,
 }).superRefine((data, ctx) => {
   const volumes = data.storage?.volumes;
 
@@ -11,3 +11,4 @@ export { default as aikit } from './aikit';
 export { default as aiconfigurator } from './aiconfigurator';
 export { costsRoutes as costs } from './costs';
 export { default as gateway } from './gateway';
+export { default as vllmRecipes } from './vllmRecipes';