kaito-project · robert-cronin · Jun 18, 2026 · May 4, 2026 · May 8, 2026 · May 8, 2026
@@ -197,6 +197,7 @@ providers-test: verify-versions
 	cd providers/kaito && go test ./...
 	cd providers/kuberay && go test ./...
 	cd providers/llmd && go test ./...
+	cd providers/vllm && go test ./...
 	@echo "✅ Provider tests completed"
 
 # Generate deploy manifests for controller and dashboard

@@ -31,6 +31,7 @@ AI Runway gives you a web UI and a unified Kubernetes CRD (`ModelDeployment`) to
 | [**KubeRay**](https://github.com/ray-project/kuberay)    | Ray-based distributed inference                                    | [kuberay.yaml](providers/kuberay/deploy/kuberay.yaml) |
 | [**KAITO**](https://github.com/kaito-project/kaito)      | vLLM (GPU) and llama.cpp (CPU/GPU) support                         | [kaito.yaml](providers/kaito/deploy/kaito.yaml)       |
 | [**LLM-D**](https://github.com/llm-d/llm-d)              | vLLM (GPU) with aggregated or disaggregated serving                | [llmd.yaml](providers/llmd/deploy/llmd.yaml)          |
+| [**Direct vLLM**](docs/providers/vllm.md)                 | Direct OpenAI-compatible vLLM Deployments for newest model support | [vllm.yaml](providers/vllm/deploy/vllm.yaml)          |
 
 ## Quick Start
 

@@ -2,7 +2,7 @@
 
 ## WHY: Project Purpose
 
-**AI Runway** is a platform for deploying and managing machine learning models on Kubernetes. It provides a unified CRD abstraction (`ModelDeployment`) that works across multiple inference providers (KAITO, Dynamo, KubeRay, llm-d, etc.).
+**AI Runway** is a platform for deploying and managing machine learning models on Kubernetes. It provides a unified CRD abstraction (`ModelDeployment`) that works across multiple inference providers (KAITO, Dynamo, KubeRay, llm-d, Direct vLLM, etc.).
 
 ## WHAT: Tech Stack & Structure
 
@@ -18,6 +18,7 @@
   - `controller/config/` - Kustomize manifests for CRDs/RBAC
 - `frontend/src/` - React components, hooks, pages
 - `backend/src/` - Hono app, providers, services
+- `providers/` - Standalone provider controllers/shims (`dynamo`, `kaito`, `kuberay`, `llmd`, `vllm`); each renders `ModelDeployment` into its upstream resource. `providers/vllm` is the in-repo Direct vLLM provider (renders native `Deployment`+`Service`, selected via `provider.name: vllm`).
 - `shared/types/` - Shared TypeScript definitions
 - `plugins/headlamp/` - Headlamp dashboard plugin
 - `docs/` - Detailed documentation (read as needed; also the source rendered on the website)
@@ -103,7 +104,9 @@ Unified API for deploying ML models. Key fields:
 - `spec.model.id` - HuggingFace model ID or custom identifier
 - `spec.model.source` - `huggingface` or `custom`
 - `spec.engine.type` - `vllm`, `sglang`, `trtllm`, or `llamacpp` (optional, auto-selected from provider capabilities)
-- `spec.provider.name` - Optional explicit provider selection
+- `spec.engine.image` - Optional engine-specific container image override (preferred over legacy top-level `spec.image`; used by Direct vLLM/custom images)
+- `spec.engine.extraArgs` - Optional list of raw engine flags appended verbatim
+- `spec.provider.name` - Optional explicit provider selection (`kaito`, `dynamo`, `kuberay`, `llmd`, `vllm`)
 - `spec.serving.mode` - `aggregated` (default) or `disaggregated`
 - `spec.resources.gpu.count` - GPU count for aggregated mode
 - `spec.scaling.prefill/decode` - Component scaling for disaggregated mode

@@ -77,6 +77,7 @@ function collectFiles(dir: string, prefix: string = ''): AssetInfo[] {
 function generateModule(assets: AssetInfo[]): string {
   const lines: string[] = [
     '// AUTO-GENERATED FILE - DO NOT EDIT',
+    '// @ts-nocheck - Bun file imports are validated by the bundler, not tsc',
     '// Generated by scripts/embed-assets.ts',
     '// Run "bun run embed" to regenerate',
     '//',

@@ -30,6 +30,7 @@ import {
   aiconfigurator,
   costs,
   gateway,
+  vllmRecipes,
 } from './routes';
 
 // Load static files at startup
@@ -203,6 +204,7 @@ app.route('/api/aikit', aikit);
 app.route('/api/aiconfigurator', aiconfigurator);
 app.route('/api/costs', costs);
 app.route('/api/gateway', gateway);
+app.route('/api/vllm/recipes', vllmRecipes);
 
 // Static file serving middleware - uses Bun.file() for zero-copy serving
 app.use('*', async (c, next) => {

@@ -264,6 +264,90 @@ describe('Deployment Routes', () => {
       const data = await res.json();
       expect(data.resources[0].manifest.spec.gateway).toEqual({ enabled: false });
     });
+
+    test('preserves env in preview manifests', async () => {
+      // Stub the default-namespace lookup so it resolves to 'default'.
+      const restoreNamespace = mockServiceMethod(
+        configService,
+        'getDefaultNamespace',
+        async () => 'default',
+      );
+      restores.push(restoreNamespace);
+
+      const env = { VLLM_USE_V1: '1', NCCL_DEBUG: 'INFO' };
+      const requestBody = { ...validDeploymentBody, provider: 'vllm', env };
+
+      const res = await app.request('/api/deployments/preview', {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify(requestBody),
+      });
+      expect(res.status).toBe(200);
+
+      // The env record must come back as a name/value list, in the same
+      // order the keys were declared above.
+      const { resources } = await res.json();
+      expect(resources[0].manifest.spec.env).toEqual([
+        { name: 'VLLM_USE_V1', value: '1' },
+        { name: 'NCCL_DEBUG', value: 'INFO' },
+      ]);
+    });
+
+    test('preserves Direct vLLM recipe provenance as metadata annotations in preview manifests', async () => {
+      // Stub the default-namespace lookup so it resolves to 'default'.
+      const restoreNamespace = mockServiceMethod(
+        configService,
+        'getDefaultNamespace',
+        async () => 'default',
+      );
+      restores.push(restoreNamespace);
+
+      const imageRef = 'vllm/vllm-openai@sha256:1111111111111111111111111111111111111111111111111111111111111111';
+      const recipeFeatures = ['prefixCaching', 'kvCacheDtype'];
+      const recipeProvenance = {
+        source: 'vllm-recipes',
+        id: 'meta-llama/Llama-3.1-8B-Instruct',
+        strategy: 'single_node_tp',
+        hardware: 'h100',
+        variant: 'default',
+        precision: 'bf16',
+        features: recipeFeatures,
+        revision: '2026-05-04',
+      };
+      const requestBody = {
+        ...validDeploymentBody,
+        name: 'recipe-vllm',
+        provider: 'vllm',
+        imageRef,
+        engineExtraArgs: ['--enable-auto-tool-choice'],
+        recipeProvenance,
+      };
+
+      const res = await app.request('/api/deployments/preview', {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify(requestBody),
+      });
+      expect(res.status).toBe(200);
+
+      const { metadata, spec, status } = (await res.json()).resources[0].manifest;
+
+      // Provenance must appear as exactly this annotation set; the feature
+      // list is expected as a JSON-encoded string.
+      expect(metadata.annotations).toEqual({
+        'airunway.ai/generated-by': 'vllm-recipe-resolver',
+        'airunway.ai/recipe.source': 'vllm-recipes',
+        'airunway.ai/recipe.id': 'meta-llama/Llama-3.1-8B-Instruct',
+        'airunway.ai/recipe.strategy': 'single_node_tp',
+        'airunway.ai/recipe.hardware': 'h100',
+        'airunway.ai/recipe.variant': 'default',
+        'airunway.ai/recipe.precision': 'bf16',
+        'airunway.ai/recipe.revision': '2026-05-04',
+        'airunway.ai/recipe.features': JSON.stringify(recipeFeatures),
+      });
+
+      // Engine settings live under spec.engine, not the top-level image field.
+      expect(spec.provider.name).toBe('vllm');
+      expect(spec.engine.type).toBe('vllm');
+      expect(spec.engine.image).toBe(imageRef);
+      expect(spec.engine.extraArgs).toEqual(['--enable-auto-tool-choice']);
+      expect(spec.image).toBeUndefined();
+
+      // Recipe data must not leak into spec or status.
+      expect(spec.recipe).toBeUndefined();
+      expect(spec.recipes).toBeUndefined();
+      expect(status?.recipe).toBeUndefined();
+    });
   });
 
   describe('POST /api/deployments - storage validation', () => {
@@ -979,6 +1063,48 @@ describe('Deployment Routes', () => {
       expect(capturedConfig.imageRef).toBe('ghcr.io/kaito-project/aikit/llama3.2:3b');
     });
 
+    test('passes env through create schema to Kubernetes service', async () => {
+      // Holds whatever config the route hands to the stubbed createDeployment.
+      let capturedConfig: DeploymentConfig | undefined;
+
+      restores.push(
+        mockServiceMethod(kubernetesService, 'createDeployment', async (config) => {
+          capturedConfig = config;
+          return undefined;
+        }),
+      );
+      // Report GPU capacity from a stub instead of querying a cluster.
+      restores.push(
+        mockServiceMethod(kubernetesService, 'getClusterGpuCapacity', async () => ({
+          totalGpus: 16,
+          allocatedGpus: 0,
+          availableGpus: 16,
+          maxContiguousAvailable: 8,
+          nodes: [],
+        })),
+      );
+      restores.push(
+        mockServiceMethod(configService, 'getDefaultNamespace', async () => 'default'),
+      );
+
+      const env = {
+        VLLM_USE_V1: '1',
+        NCCL_DEBUG: 'INFO',
+      };
+
+      const res = await app.request('/api/deployments', {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify({
+          ...validDeploymentBody,
+          name: 'env-test',
+          provider: 'vllm',
+          env,
+        }),
+      });
+
+      expect(res.status).toBe(201);
+      // capturedConfig stays undefined if the stub was never invoked. Optional
+      // chaining turns that case into a failed expectation rather than a
+      // TypeError, and satisfies strict null checks on the `| undefined` type.
+      expect(capturedConfig?.env).toEqual(env);
+    });
+
     test('accepts deployment with providerOverrides', async () => {
       restores.push(
         mockServiceMethod(kubernetesService, 'createDeployment', async () => undefined),

@@ -132,6 +132,17 @@ const storageSchema = z.object({
   volumes: z.array(storageVolumeSchema).max(8, 'Maximum 8 storage volumes allowed').optional(),
 }).optional();
 
+const recipeProvenanceSchema = z.object({ // recipe provenance sent with a deployment request; every field is optional
+  source: z.string().optional(),
+  id: z.string().optional(),
+  strategy: z.string().optional(),
+  hardware: z.string().optional(),
+  variant: z.string().optional(),
+  precision: z.string().optional(),
+  features: z.array(z.string()).optional(), // list of strings; no per-item constraints
+  revision: z.string().optional(),
+}).optional(); // the whole object may be omitted from the request
+
 const createDeploymentSchema = z.object({
   name: resourceNameSchema,
   modelId: z.string().min(1, 'Model ID is required'),
@@ -152,6 +163,8 @@ const createDeploymentSchema = z.object({
     memory: z.string().optional(),
   }).optional(),
   engineArgs: z.record(z.string(), z.unknown()).optional(),
+  engineExtraArgs: z.array(z.string()).optional(),
+  env: z.record(z.string(), z.string()).optional(),
   providerOverrides: z.record(z.string(), z.unknown()).optional(),
   prefillReplicas: z.number().int().min(0).optional(),
   decodeReplicas: z.number().int().min(0).optional(),
@@ -165,6 +178,7 @@ const createDeploymentSchema = z.object({
   computeType: z.enum(['cpu', 'gpu']).optional(),
   maxModelLen: z.number().int().positive().optional(),
   gatewayEnabled: z.boolean().optional(),
+  recipeProvenance: recipeProvenanceSchema,
   storage: storageSchema,
 }).superRefine((data, ctx) => {
   const volumes = data.storage?.volumes;

@@ -11,3 +11,4 @@ export { default as aikit } from './aikit';
 export { default as aiconfigurator } from './aiconfigurator';
 export { costsRoutes as costs } from './costs';
 export { default as gateway } from './gateway';
+export { default as vllmRecipes } from './vllmRecipes';
@@ -0,0 +1,102 @@
+import { Hono } from 'hono';
+import { zValidator } from '@hono/zod-validator';
+import { z } from 'zod';
+import { HTTPException } from 'hono/http-exception';
+import type { ContentfulStatusCode } from 'hono/utils/http-status';
+import type { VllmRecipeResolveRequest } from '@airunway/shared';
+import {
+  vllmRecipesClient,
+  VllmRecipeValidationError,
+  VllmRecipeTimeoutError,
+} from '../services/vllmRecipesClient';
+import { vllmRecipeResolver } from '../services/vllmRecipeResolver';
+import logger from '../lib/logger';
+
+/**
+ * Chooses the HTTP status for a failed recipe call so callers can tell bad
+ * input (4xx, do not retry) apart from an upstream recipes.vllm.ai outage (5xx).
+ *
+ * - `VllmRecipeValidationError` -> 400
+ * - `VllmRecipeTimeoutError`    -> 504
+ * - anything else               -> 502
+ */
+function recipeErrorStatus(error: unknown): ContentfulStatusCode {
+  // The validation check stays first so it wins if an error matches both classes.
+  return error instanceof VllmRecipeValidationError
+    ? 400
+    : error instanceof VllmRecipeTimeoutError
+      ? 504
+      : 502;
+}
+
+/**
+ * Wraps a failed recipe call in an `HTTPException`.
+ *
+ * The status comes from `recipeErrorStatus(error)`. The message is the
+ * error's own message when `error` is an `Error`, otherwise `fallbackMessage`.
+ * The original value is attached as `cause` so error handlers can still
+ * reach it after the wrap.
+ *
+ * @param error - Whatever was caught; may be a non-Error value.
+ * @param fallbackMessage - Message used when `error` is not an `Error`.
+ * @returns The exception for the route handler to throw.
+ */
+function recipeHttpException(error: unknown, fallbackMessage: string): HTTPException {
+  return new HTTPException(recipeErrorStatus(error), {
+    message: error instanceof Error ? error.message : fallbackMessage,
+    cause: error,
+  });
+}
+
+// The three variants of an image choice, told apart by `type`. Only the
+// 'custom' variant carries extra data: a non-empty `imageRef`.
+const recipeImageChoice = z.object({ type: z.literal('recipe') });
+const customImageChoice = z.object({
+  type: z.literal('custom'),
+  imageRef: z.string().min(1, 'imageRef is required'),
+});
+const noImageChoice = z.object({ type: z.literal('none') });
+
+const imageChoiceSchema = z.discriminatedUnion('type', [
+  recipeImageChoice,
+  customImageChoice,
+  noImageChoice,
+]);
+
+// Base for optional selectors: when present they must be non-empty strings.
+const nonEmptyString = z.string().min(1);
+
+// Body accepted by POST /resolve. Only `modelId` is required.
+const resolveRequestSchema = z.object({
+  modelId: z.string().min(1, 'modelId is required'),
+  mode: z.enum(['aggregated', 'disaggregated']).optional(),
+  hardware: nonEmptyString.optional(),
+  strategy: nonEmptyString.optional(),
+  variant: nonEmptyString.optional(),
+  features: z.array(nonEmptyString).optional(),
+  imageChoice: imageChoiceSchema.optional(),
+});
+
+const vllmRecipes = new Hono()
+  /**
+   * GET /api/vllm/recipes
+   * List recipe index entries from recipes.vllm.ai.
+   *
+   * Returns the result of `vllmRecipesClient.list()` as JSON. Any failure is
+   * logged and rethrown as an HTTPException whose status comes from
+   * `recipeErrorStatus`: 400 for a validation error, 504 for a timeout,
+   * 502 for anything else.
+   *
+   * NOTE(review): the handlers in this chain log the caught value under the
+   * `error` key. If the logger is pino with default serializers, only `err`
+   * is serialized as an Error — confirm the logger config.
+   */
+  .get('/', async (c) => {
+    try {
+      const result = await vllmRecipesClient.list();
+      return c.json(result);
+    } catch (error) {
+      logger.error({ error }, 'Failed to list vLLM recipes');
+      throw recipeHttpException(error, 'Failed to list vLLM recipes');
+    }
+  })
+
+  /**
+   * POST /api/vllm/recipes/resolve
+   * Resolve a vLLM recipe into Direct vLLM deployment fields.
+   *
+   * The JSON body is validated against `resolveRequestSchema` by the
+   * `zValidator` middleware before the handler runs. The validated body is
+   * then cast to `VllmRecipeResolveRequest`.
+   * NOTE(review): the cast assumes the schema output matches that shared
+   * type — confirm they stay in sync.
+   *
+   * Failures from `vllmRecipeResolver.resolve` are logged with the requested
+   * `modelId` and rethrown with the status from `recipeErrorStatus`
+   * (400 / 504 / 502).
+   */
+  .post('/resolve', zValidator('json', resolveRequestSchema), async (c) => {
+    const request = c.req.valid('json') as VllmRecipeResolveRequest;
+
+    try {
+      const result = await vllmRecipeResolver.resolve(request);
+      return c.json(result);
+    } catch (error) {
+      logger.error({ error, modelId: request.modelId }, 'Failed to resolve vLLM recipe');
+      throw recipeHttpException(error, 'Failed to resolve vLLM recipe');
+    }
+  })
+
+  /**
+   * GET /api/vllm/recipes/:org/:model
+   * Fetch the raw recipe payload for a Hugging Face model ID.
+   *
+   * `:model` is intentionally a single path segment (no `{.+}`): a Hugging Face
+   * model ID is exactly `<org>/<model>`, and allowing `/` here would let crafted
+   * paths traverse under the recipes base URL. `vllmRecipesClient` re-validates
+   * the resulting ID as a second layer of defense.
+   *
+   * Failures from `vllmRecipesClient.get` are logged with `org` and `model`
+   * and rethrown with the status from `recipeErrorStatus` (400 / 504 / 502).
+   */
+  .get('/:org/:model', async (c) => {
+    const org = c.req.param('org');
+    const model = c.req.param('model');
+
+    try {
+      const result = await vllmRecipesClient.get(org, model);
+      return c.json(result);
+    } catch (error) {
+      logger.error({ error, org, model }, 'Failed to fetch vLLM recipe');
+      throw recipeHttpException(error, 'Failed to fetch vLLM recipe');
+    }
+  });
+
+export default vllmRecipes;