kaito-project
diff --git a/‎backend/src/routes/costs.test.ts‎
Lines changed: 97 additions & 20 deletions b/‎backend/src/routes/costs.test.ts‎
Lines changed: 97 additions & 20 deletions
diff --git a/‎backend/src/routes/deployments.test.ts‎
Lines changed: 141 additions & 0 deletions b/‎backend/src/routes/deployments.test.ts‎
Lines changed: 141 additions & 0 deletions
diff --git a/‎backend/src/routes/deployments.ts‎
Lines changed: 41 additions & 4 deletions b/‎backend/src/routes/deployments.ts‎
Lines changed: 41 additions & 4 deletions
@@ -1,9 +1,12 @@
 import { describe, test, expect, beforeEach, afterEach, mock } from 'bun:test';
 import app from '../hono-app';
 import { cloudPricingService } from '../services/cloudPricing';
+import { kubernetesService } from '../services/kubernetes';
+import { mockServiceMethod } from '../test/helpers';
 
 // Mock fetch globally for pricing API tests
 const originalFetch = globalThis.fetch;
+const restores: Array<() => void> = [];
 
 function mockFetch(response: unknown, options?: { ok?: boolean; status?: number }) {
   // @ts-expect-error - mocking fetch for tests
@@ -23,6 +26,8 @@ describe('Costs Routes', () => {
   });
 
   afterEach(() => {
+    restores.forEach((restore) => restore());
+    restores.length = 0;
     globalThis.fetch = originalFetch;
   });
 
@@ -313,45 +318,117 @@ describe('Costs Routes', () => {
 
   describe('GET /api/costs/node-pools', () => {
     test('returns node pool costs with cache stats', async () => {
-      // This endpoint requires K8s, so we test the response structure
-      // when K8s returns an error or empty data
+      restores.push(
+        mockServiceMethod(kubernetesService, 'getDetailedClusterGpuCapacity', async () => ({
+          totalGpus: 4,
+          allocatedGpus: 1,
+          availableGpus: 3,
+          maxContiguousAvailable: 3,
+          maxNodeGpuCapacity: 4,
+          gpuNodeCount: 1,
+          nodePools: [
+            {
+              name: 'gpu-pool',
+              gpuCount: 4,
+              nodeCount: 1,
+              availableGpus: 3,
+              gpuModel: 'A100-80GB',
+            },
+          ],
+        })),
+      );
+
       const res = await app.request('/api/costs/node-pools');
 
-      // May succeed with empty data or fail with K8s error
-      expect([200, 500]).toContain(res.status);
+      expect(res.status).toBe(200);
 
-      if (res.status === 200) {
-        const data = await res.json();
-        expect(data.success).toBe(true);
-        expect(data.nodePoolCosts).toBeDefined();
-        expect(Array.isArray(data.nodePoolCosts)).toBe(true);
-        expect(data.pricingSource).toBeDefined();
-        expect(data.cacheStats).toBeDefined();
-      }
+      const data = await res.json();
+      expect(data.success).toBe(true);
+      expect(data.nodePoolCosts).toBeDefined();
+      expect(Array.isArray(data.nodePoolCosts)).toBe(true);
+      expect(data.nodePoolCosts[0].poolName).toBe('gpu-pool');
+      expect(data.pricingSource).toBe('realtime-with-fallback');
+      expect(data.cacheStats).toBeDefined();
     });
 
     test('accepts gpuCount and replicas query params', async () => {
+      restores.push(
+        mockServiceMethod(kubernetesService, 'getDetailedClusterGpuCapacity', async () => ({
+          totalGpus: 8,
+          allocatedGpus: 0,
+          availableGpus: 8,
+          maxContiguousAvailable: 8,
+          maxNodeGpuCapacity: 8,
+          gpuNodeCount: 1,
+          nodePools: [
+            {
+              name: 'gpu-pool',
+              gpuCount: 8,
+              nodeCount: 2,
+              availableGpus: 8,
+              gpuModel: 'H100-80GB',
+            },
+          ],
+        })),
+      );
+
       const res = await app.request('/api/costs/node-pools?gpuCount=2&replicas=3');
 
-      // May succeed or fail depending on K8s
-      expect([200, 500]).toContain(res.status);
+      expect(res.status).toBe(200);
+
+      const data = await res.json();
+      expect(data.nodePoolCosts[0].costBreakdown.totalGpus).toBe(6);
     });
 
     test('accepts realtime=false to disable realtime pricing', async () => {
+      restores.push(
+        mockServiceMethod(kubernetesService, 'getDetailedClusterGpuCapacity', async () => ({
+          totalGpus: 4,
+          allocatedGpus: 0,
+          availableGpus: 4,
+          maxContiguousAvailable: 4,
+          maxNodeGpuCapacity: 4,
+          gpuNodeCount: 1,
+          nodePools: [
+            {
+              name: 'gpu-pool',
+              gpuCount: 4,
+              nodeCount: 1,
+              availableGpus: 4,
+              gpuModel: 'A100-80GB',
+              instanceType: 'Standard_NC24ads_A100_v4',
+            },
+          ],
+        })),
+      );
+
       const res = await app.request('/api/costs/node-pools?realtime=false');
 
-      expect([200, 500]).toContain(res.status);
+      expect(res.status).toBe(200);
 
-      if (res.status === 200) {
-        const data = await res.json();
-        expect(data.pricingSource).toBe('static');
-      }
+      const data = await res.json();
+      expect(data.pricingSource).toBe('static');
+      expect(data.nodePoolCosts[0].realtimePricing).toBeUndefined();
     });
 
     test('accepts computeType=cpu for CPU-only pools', async () => {
+      restores.push(
+        mockServiceMethod(kubernetesService, 'getAllNodePools', async () => ([
+          {
+            name: 'cpu-pool',
+            gpuCount: 0,
+            nodeCount: 2,
+            availableGpus: 0,
+          },
+        ])),
+      );
+
       const res = await app.request('/api/costs/node-pools?computeType=cpu');
 
-      expect([200, 500]).toContain(res.status);
+      expect(res.status).toBe(200);
+
+      const data = await res.json();
+      expect(data.nodePoolCosts).toBeArray();
     });
   });
 });
@@ -91,6 +91,62 @@ describe('Deployment Routes', () => {
       expect(spec.model.storage.volumes[0].name).toBe('model-cache');
       expect(spec.model.storage.volumes[0].size).toBe('100Gi');
     });
+
+    test('normalizes KAITO GGUF deployments to llamacpp in preview manifests', async () => {
+      // Stub the default-namespace lookup and register the undo callback in `restores`.
+      const restoreNamespace = mockServiceMethod(
+        configService,
+        'getDefaultNamespace',
+        async () => 'kaito-workspace',
+      );
+      restores.push(restoreNamespace);
+
+      // KAITO + Hugging Face GGUF file in "direct" run mode, layered on the shared valid body.
+      const payload = {
+        ...validDeploymentBody,
+        namespace: 'kaito-workspace',
+        provider: 'kaito',
+        modelId: 'nvidia/Nemotron-3-Nano-4B-gguf',
+        modelSource: 'huggingface',
+        ggufFile: 'nvidia-nemotron-3-nano-4b.Q4_K_M.gguf',
+        ggufRunMode: 'direct',
+      };
+
+      const res = await app.request('/api/deployments/preview', {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify(payload),
+      });
+
+      expect(res.status).toBe(200);
+
+      // All assertions target the spec of the first previewed resource.
+      const { resources } = await res.json();
+      const previewSpec = resources[0].manifest.spec;
+      expect(previewSpec.engine.type).toBe('llamacpp');
+      expect(previewSpec.engine.args.ggufUrl).toBe(
+        'https://huggingface.co/nvidia/Nemotron-3-Nano-4B-gguf/resolve/main/nvidia-nemotron-3-nano-4b.Q4_K_M.gguf'
+      );
+      expect(previewSpec.image).toBe('ghcr.io/kaito-project/aikit/runners/llama-cpp-cuda:latest');
+      expect(previewSpec.provider.name).toBe('kaito');
+    });
+
+    test('normalizes KAITO premade deployments to llamacpp in preview manifests', async () => {
+      // Stub the default-namespace lookup and register the undo callback in `restores`.
+      const restoreNamespace = mockServiceMethod(
+        configService,
+        'getDefaultNamespace',
+        async () => 'kaito-workspace',
+      );
+      restores.push(restoreNamespace);
+
+      // KAITO deployment of a premade model, layered on the shared valid body.
+      const payload = {
+        ...validDeploymentBody,
+        namespace: 'kaito-workspace',
+        provider: 'kaito',
+        modelId: 'llama3.2:3b',
+        modelSource: 'premade',
+        premadeModel: 'llama3.2:3b',
+      };
+
+      const res = await app.request('/api/deployments/preview', {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify(payload),
+      });
+
+      expect(res.status).toBe(200);
+
+      // All assertions target the spec of the first previewed resource.
+      const { resources } = await res.json();
+      const previewSpec = resources[0].manifest.spec;
+      expect(previewSpec.engine.type).toBe('llamacpp');
+      expect(previewSpec.image).toBe('ghcr.io/kaito-project/aikit/llama3.2:3b');
+      expect(previewSpec.provider.name).toBe('kaito');
+    });
   });
 
   describe('POST /api/deployments - storage validation', () => {
@@ -577,6 +633,91 @@ describe('Deployment Routes', () => {
   });
 
   describe('POST /api/deployments', () => {
+    test('resolves direct KAITO GGUF deployments to the runner image', async () => {
+      // Filled in by the createDeployment stub with whatever config the route passes to it.
+      let capturedConfig: any;
+
+      // Stubs are registered in one call; argument order matches the order they are installed.
+      restores.push(
+        mockServiceMethod(kubernetesService, 'createDeployment', async (config) => {
+          capturedConfig = config;
+          return undefined;
+        }),
+        mockServiceMethod(kubernetesService, 'getClusterGpuCapacity', async () => ({
+          totalGpus: 8,
+          allocatedGpus: 0,
+          availableGpus: 8,
+          maxContiguousAvailable: 8,
+          nodes: [],
+        })),
+        mockServiceMethod(configService, 'getDefaultNamespace', async () => 'kaito-workspace'),
+      );
+
+      // KAITO + Hugging Face GGUF file in "direct" run mode, requesting one GPU.
+      const payload = {
+        name: 'nemotron-direct',
+        namespace: 'kaito-workspace',
+        provider: 'kaito',
+        modelId: 'unsloth/NVIDIA-Nemotron-3-Nano-4B-GGUF',
+        engine: 'llamacpp',
+        modelSource: 'huggingface',
+        ggufFile: 'NVIDIA-Nemotron-3-Nano-4B-Q4_K_M.gguf',
+        ggufRunMode: 'direct',
+        resources: { gpu: 1 },
+      };
+
+      const res = await app.request('/api/deployments', {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify(payload),
+      });
+
+      expect(res.status).toBe(201);
+
+      // The captured config must carry the runner image and the resolved GGUF download URL.
+      expect(capturedConfig.imageRef).toBe('ghcr.io/kaito-project/aikit/runners/llama-cpp-cuda:latest');
+      expect(capturedConfig.engineArgs?.ggufUrl).toBe(
+        'https://huggingface.co/unsloth/NVIDIA-Nemotron-3-Nano-4B-GGUF/resolve/main/NVIDIA-Nemotron-3-Nano-4B-Q4_K_M.gguf'
+      );
+    });
+
+    test('resolves premade KAITO deployments to the premade image', async () => {
+      // Filled in by the createDeployment stub with whatever config the route passes to it.
+      let capturedConfig: any;
+
+      // Stubs are registered in one call; argument order matches the order they are installed.
+      restores.push(
+        mockServiceMethod(kubernetesService, 'createDeployment', async (config) => {
+          capturedConfig = config;
+          return undefined;
+        }),
+        mockServiceMethod(kubernetesService, 'getClusterGpuCapacity', async () => ({
+          totalGpus: 8,
+          allocatedGpus: 0,
+          availableGpus: 8,
+          maxContiguousAvailable: 8,
+          nodes: [],
+        })),
+        mockServiceMethod(configService, 'getDefaultNamespace', async () => 'kaito-workspace'),
+      );
+
+      // KAITO deployment of a premade model; no imageRef is supplied by the client.
+      const payload = {
+        name: 'llama-premade',
+        namespace: 'kaito-workspace',
+        provider: 'kaito',
+        modelId: 'llama3.2:3b',
+        engine: 'llamacpp',
+        modelSource: 'premade',
+        premadeModel: 'llama3.2:3b',
+      };
+
+      const res = await app.request('/api/deployments', {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify(payload),
+      });
+
+      expect(res.status).toBe(201);
+      expect(capturedConfig.imageRef).toBe('ghcr.io/kaito-project/aikit/llama3.2:3b');
+    });
+
     test('accepts deployment with providerOverrides', async () => {
       restores.push(
         mockServiceMethod(kubernetesService, 'createDeployment', async () => undefined),
 
@@ -6,6 +6,7 @@ import { kubernetesService } from '../services/kubernetes';
 import { configService } from '../services/config';
 import { metricsService } from '../services/metrics';
 import { validateGpuFit, formatGpuWarnings } from '../services/gpuValidation';
+import { aikitService, GGUF_RUNNER_IMAGE } from '../services/aikit';
 import { handleK8sError } from '../lib/k8s-errors';
 import models from '../data/models.json';
 import logger from '../lib/logger';
@@ -285,6 +286,42 @@ const createDeploymentSchema = z.object({
   }
 });
 
+/**
+ * Fills in the container image (and, for direct GGUF runs, the model URL) of a
+ * KAITO deployment config. The input object is never mutated.
+ *
+ * - Non-KAITO providers, and KAITO configs matching neither case below, are
+ *   returned unchanged.
+ * - Premade models: when no `imageRef` is set, the image is looked up through
+ *   `aikitService.getImageRef`; if the lookup yields nothing the config is
+ *   returned unchanged.
+ * - Hugging Face models with `ggufRunMode === 'direct'`: `imageRef` falls back
+ *   to `GGUF_RUNNER_IMAGE`, and when `ggufFile` is set, `engineArgs.ggufUrl` is
+ *   set from `aikitService.buildHuggingFaceUrl`, replacing any `ggufUrl`
+ *   already present in `engineArgs`.
+ *
+ * @param config - Deployment config built from the validated request body.
+ * @returns The same object when nothing needs resolving, otherwise a copy with
+ *   the resolved fields.
+ */
+function resolveDeploymentImages(config: DeploymentConfig): DeploymentConfig {
+  if (config.provider === 'kaito') {
+    if (config.modelSource === 'premade' && config.premadeModel) {
+      // An explicit imageRef wins; otherwise try to look up the premade image.
+      if (!config.imageRef) {
+        const premadeImage = aikitService.getImageRef({
+          modelSource: 'premade',
+          premadeModel: config.premadeModel,
+        });
+        if (premadeImage) {
+          return { ...config, imageRef: premadeImage };
+        }
+      }
+      return config;
+    }
+
+    if (config.modelSource === 'huggingface' && config.ggufRunMode === 'direct') {
+      const withRunnerImage: DeploymentConfig = {
+        ...config,
+        imageRef: config.imageRef || GGUF_RUNNER_IMAGE,
+      };
+
+      if (!config.ggufFile) {
+        return withRunnerImage;
+      }
+
+      // ggufUrl is spread last so it overrides any caller-supplied value.
+      return {
+        ...withRunnerImage,
+        engineArgs: {
+          ...(config.engineArgs || {}),
+          ggufUrl: aikitService.buildHuggingFaceUrl(config.modelId, config.ggufFile),
+        },
+      };
+    }
+  }
+
+  return config;
+}
+
 const deployments = new Hono()
   .get('/', zValidator('query', listDeploymentsQuerySchema), async (c) => {
     try {
@@ -323,10 +360,10 @@ const deployments = new Hono()
   .post('/', zValidator('json', createDeploymentSchema), async (c) => {
     const body = c.req.valid('json');
 
-    const config: DeploymentConfig = {
+    const config = resolveDeploymentImages({
       ...body,
       namespace: body.namespace || (await configService.getDefaultNamespace()),
-    };
+    });
 
     // GPU fit validation
     let gpuWarnings: string[] = [];
@@ -383,10 +420,10 @@ const deployments = new Hono()
   })
   .post('/preview', zValidator('json', createDeploymentSchema), async (c) => {
     const body = c.req.valid('json');
-    const config: DeploymentConfig = {
+    const config = resolveDeploymentImages({
       ...body,
       namespace: body.namespace || (await configService.getDefaultNamespace()),
-    };
+    });
 
     // Apply storage defaults that the mutating webhook would add,
     // so the preview manifest matches what Kubernetes will persist.