Skip to content

Commit 054ba06

Browse files
authored
Improve KAITO deployment normalization and gateway guidance (#165)
1 parent 2a209ff commit 054ba06

17 files changed

Lines changed: 647 additions & 82 deletions

File tree

backend/src/routes/costs.test.ts

Lines changed: 97 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,12 @@
11
import { describe, test, expect, beforeEach, afterEach, mock } from 'bun:test';
22
import app from '../hono-app';
33
import { cloudPricingService } from '../services/cloudPricing';
4+
import { kubernetesService } from '../services/kubernetes';
5+
import { mockServiceMethod } from '../test/helpers';
46

57
// Mock fetch globally for pricing API tests
68
const originalFetch = globalThis.fetch;
9+
const restores: Array<() => void> = [];
710

811
function mockFetch(response: unknown, options?: { ok?: boolean; status?: number }) {
912
// @ts-expect-error - mocking fetch for tests
@@ -23,6 +26,8 @@ describe('Costs Routes', () => {
2326
});
2427

2528
afterEach(() => {
29+
restores.forEach((restore) => restore());
30+
restores.length = 0;
2631
globalThis.fetch = originalFetch;
2732
});
2833

@@ -313,45 +318,117 @@ describe('Costs Routes', () => {
313318

314319
describe('GET /api/costs/node-pools', () => {
315320
test('returns node pool costs with cache stats', async () => {
316-
// This endpoint requires K8s, so we test the response structure
317-
// when K8s returns an error or empty data
321+
restores.push(
322+
mockServiceMethod(kubernetesService, 'getDetailedClusterGpuCapacity', async () => ({
323+
totalGpus: 4,
324+
allocatedGpus: 1,
325+
availableGpus: 3,
326+
maxContiguousAvailable: 3,
327+
maxNodeGpuCapacity: 4,
328+
gpuNodeCount: 1,
329+
nodePools: [
330+
{
331+
name: 'gpu-pool',
332+
gpuCount: 4,
333+
nodeCount: 1,
334+
availableGpus: 3,
335+
gpuModel: 'A100-80GB',
336+
},
337+
],
338+
})),
339+
);
340+
318341
const res = await app.request('/api/costs/node-pools');
319342

320-
// May succeed with empty data or fail with K8s error
321-
expect([200, 500]).toContain(res.status);
343+
expect(res.status).toBe(200);
322344

323-
if (res.status === 200) {
324-
const data = await res.json();
325-
expect(data.success).toBe(true);
326-
expect(data.nodePoolCosts).toBeDefined();
327-
expect(Array.isArray(data.nodePoolCosts)).toBe(true);
328-
expect(data.pricingSource).toBeDefined();
329-
expect(data.cacheStats).toBeDefined();
330-
}
345+
const data = await res.json();
346+
expect(data.success).toBe(true);
347+
expect(data.nodePoolCosts).toBeDefined();
348+
expect(Array.isArray(data.nodePoolCosts)).toBe(true);
349+
expect(data.nodePoolCosts[0].poolName).toBe('gpu-pool');
350+
expect(data.pricingSource).toBe('realtime-with-fallback');
351+
expect(data.cacheStats).toBeDefined();
331352
});
332353

333354
test('accepts gpuCount and replicas query params', async () => {
355+
restores.push(
356+
mockServiceMethod(kubernetesService, 'getDetailedClusterGpuCapacity', async () => ({
357+
totalGpus: 8,
358+
allocatedGpus: 0,
359+
availableGpus: 8,
360+
maxContiguousAvailable: 8,
361+
maxNodeGpuCapacity: 8,
362+
gpuNodeCount: 1,
363+
nodePools: [
364+
{
365+
name: 'gpu-pool',
366+
gpuCount: 8,
367+
nodeCount: 2,
368+
availableGpus: 8,
369+
gpuModel: 'H100-80GB',
370+
},
371+
],
372+
})),
373+
);
374+
334375
const res = await app.request('/api/costs/node-pools?gpuCount=2&replicas=3');
335376

336-
// May succeed or fail depending on K8s
337-
expect([200, 500]).toContain(res.status);
377+
expect(res.status).toBe(200);
378+
379+
const data = await res.json();
380+
expect(data.nodePoolCosts[0].costBreakdown.totalGpus).toBe(6);
338381
});
339382

340383
test('accepts realtime=false to disable realtime pricing', async () => {
384+
restores.push(
385+
mockServiceMethod(kubernetesService, 'getDetailedClusterGpuCapacity', async () => ({
386+
totalGpus: 4,
387+
allocatedGpus: 0,
388+
availableGpus: 4,
389+
maxContiguousAvailable: 4,
390+
maxNodeGpuCapacity: 4,
391+
gpuNodeCount: 1,
392+
nodePools: [
393+
{
394+
name: 'gpu-pool',
395+
gpuCount: 4,
396+
nodeCount: 1,
397+
availableGpus: 4,
398+
gpuModel: 'A100-80GB',
399+
instanceType: 'Standard_NC24ads_A100_v4',
400+
},
401+
],
402+
})),
403+
);
404+
341405
const res = await app.request('/api/costs/node-pools?realtime=false');
342406

343-
expect([200, 500]).toContain(res.status);
407+
expect(res.status).toBe(200);
344408

345-
if (res.status === 200) {
346-
const data = await res.json();
347-
expect(data.pricingSource).toBe('static');
348-
}
409+
const data = await res.json();
410+
expect(data.pricingSource).toBe('static');
411+
expect(data.nodePoolCosts[0].realtimePricing).toBeUndefined();
349412
});
350413

351414
test('accepts computeType=cpu for CPU-only pools', async () => {
415+
restores.push(
416+
mockServiceMethod(kubernetesService, 'getAllNodePools', async () => ([
417+
{
418+
name: 'cpu-pool',
419+
gpuCount: 0,
420+
nodeCount: 2,
421+
availableGpus: 0,
422+
},
423+
])),
424+
);
425+
352426
const res = await app.request('/api/costs/node-pools?computeType=cpu');
353427

354-
expect([200, 500]).toContain(res.status);
428+
expect(res.status).toBe(200);
429+
430+
const data = await res.json();
431+
expect(data.nodePoolCosts).toBeArray();
355432
});
356433
});
357434
});

backend/src/routes/deployments.test.ts

Lines changed: 141 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -91,6 +91,62 @@ describe('Deployment Routes', () => {
9191
expect(spec.model.storage.volumes[0].name).toBe('model-cache');
9292
expect(spec.model.storage.volumes[0].size).toBe('100Gi');
9393
});
94+
95+
test('normalizes KAITO GGUF deployments to llamacpp in preview manifests', async () => {
96+
restores.push(
97+
mockServiceMethod(configService, 'getDefaultNamespace', async () => 'kaito-workspace'),
98+
);
99+
100+
const res = await app.request('/api/deployments/preview', {
101+
method: 'POST',
102+
headers: { 'Content-Type': 'application/json' },
103+
body: JSON.stringify({
104+
...validDeploymentBody,
105+
namespace: 'kaito-workspace',
106+
provider: 'kaito',
107+
modelId: 'nvidia/Nemotron-3-Nano-4B-gguf',
108+
modelSource: 'huggingface',
109+
ggufFile: 'nvidia-nemotron-3-nano-4b.Q4_K_M.gguf',
110+
ggufRunMode: 'direct',
111+
}),
112+
});
113+
114+
expect(res.status).toBe(200);
115+
116+
const data = await res.json();
117+
expect(data.resources[0].manifest.spec.engine.type).toBe('llamacpp');
118+
expect(data.resources[0].manifest.spec.engine.args.ggufUrl).toBe(
119+
'https://huggingface.co/nvidia/Nemotron-3-Nano-4B-gguf/resolve/main/nvidia-nemotron-3-nano-4b.Q4_K_M.gguf'
120+
);
121+
expect(data.resources[0].manifest.spec.image).toBe('ghcr.io/kaito-project/aikit/runners/llama-cpp-cuda:latest');
122+
expect(data.resources[0].manifest.spec.provider.name).toBe('kaito');
123+
});
124+
125+
test('normalizes KAITO premade deployments to llamacpp in preview manifests', async () => {
126+
restores.push(
127+
mockServiceMethod(configService, 'getDefaultNamespace', async () => 'kaito-workspace'),
128+
);
129+
130+
const res = await app.request('/api/deployments/preview', {
131+
method: 'POST',
132+
headers: { 'Content-Type': 'application/json' },
133+
body: JSON.stringify({
134+
...validDeploymentBody,
135+
namespace: 'kaito-workspace',
136+
provider: 'kaito',
137+
modelId: 'llama3.2:3b',
138+
modelSource: 'premade',
139+
premadeModel: 'llama3.2:3b',
140+
}),
141+
});
142+
143+
expect(res.status).toBe(200);
144+
145+
const data = await res.json();
146+
expect(data.resources[0].manifest.spec.engine.type).toBe('llamacpp');
147+
expect(data.resources[0].manifest.spec.image).toBe('ghcr.io/kaito-project/aikit/llama3.2:3b');
148+
expect(data.resources[0].manifest.spec.provider.name).toBe('kaito');
149+
});
94150
});
95151

96152
describe('POST /api/deployments - storage validation', () => {
@@ -577,6 +633,91 @@ describe('Deployment Routes', () => {
577633
});
578634

579635
describe('POST /api/deployments', () => {
636+
test('resolves direct KAITO GGUF deployments to the runner image', async () => {
637+
let capturedConfig: any;
638+
639+
restores.push(
640+
mockServiceMethod(kubernetesService, 'createDeployment', async (config) => {
641+
capturedConfig = config;
642+
return undefined;
643+
}),
644+
);
645+
restores.push(
646+
mockServiceMethod(kubernetesService, 'getClusterGpuCapacity', async () => ({
647+
totalGpus: 8,
648+
allocatedGpus: 0,
649+
availableGpus: 8,
650+
maxContiguousAvailable: 8,
651+
nodes: [],
652+
})),
653+
);
654+
restores.push(
655+
mockServiceMethod(configService, 'getDefaultNamespace', async () => 'kaito-workspace'),
656+
);
657+
658+
const res = await app.request('/api/deployments', {
659+
method: 'POST',
660+
headers: { 'Content-Type': 'application/json' },
661+
body: JSON.stringify({
662+
name: 'nemotron-direct',
663+
namespace: 'kaito-workspace',
664+
provider: 'kaito',
665+
modelId: 'unsloth/NVIDIA-Nemotron-3-Nano-4B-GGUF',
666+
engine: 'llamacpp',
667+
modelSource: 'huggingface',
668+
ggufFile: 'NVIDIA-Nemotron-3-Nano-4B-Q4_K_M.gguf',
669+
ggufRunMode: 'direct',
670+
resources: { gpu: 1 },
671+
}),
672+
});
673+
674+
expect(res.status).toBe(201);
675+
expect(capturedConfig.imageRef).toBe('ghcr.io/kaito-project/aikit/runners/llama-cpp-cuda:latest');
676+
expect(capturedConfig.engineArgs?.ggufUrl).toBe(
677+
'https://huggingface.co/unsloth/NVIDIA-Nemotron-3-Nano-4B-GGUF/resolve/main/NVIDIA-Nemotron-3-Nano-4B-Q4_K_M.gguf'
678+
);
679+
});
680+
681+
test('resolves premade KAITO deployments to the premade image', async () => {
682+
let capturedConfig: any;
683+
684+
restores.push(
685+
mockServiceMethod(kubernetesService, 'createDeployment', async (config) => {
686+
capturedConfig = config;
687+
return undefined;
688+
}),
689+
);
690+
restores.push(
691+
mockServiceMethod(kubernetesService, 'getClusterGpuCapacity', async () => ({
692+
totalGpus: 8,
693+
allocatedGpus: 0,
694+
availableGpus: 8,
695+
maxContiguousAvailable: 8,
696+
nodes: [],
697+
})),
698+
);
699+
restores.push(
700+
mockServiceMethod(configService, 'getDefaultNamespace', async () => 'kaito-workspace'),
701+
);
702+
703+
const res = await app.request('/api/deployments', {
704+
method: 'POST',
705+
headers: { 'Content-Type': 'application/json' },
706+
body: JSON.stringify({
707+
name: 'llama-premade',
708+
namespace: 'kaito-workspace',
709+
provider: 'kaito',
710+
modelId: 'llama3.2:3b',
711+
engine: 'llamacpp',
712+
modelSource: 'premade',
713+
premadeModel: 'llama3.2:3b',
714+
}),
715+
});
716+
717+
expect(res.status).toBe(201);
718+
expect(capturedConfig.imageRef).toBe('ghcr.io/kaito-project/aikit/llama3.2:3b');
719+
});
720+
580721
test('accepts deployment with providerOverrides', async () => {
581722
restores.push(
582723
mockServiceMethod(kubernetesService, 'createDeployment', async () => undefined),

backend/src/routes/deployments.ts

Lines changed: 41 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@ import { kubernetesService } from '../services/kubernetes';
66
import { configService } from '../services/config';
77
import { metricsService } from '../services/metrics';
88
import { validateGpuFit, formatGpuWarnings } from '../services/gpuValidation';
9+
import { aikitService, GGUF_RUNNER_IMAGE } from '../services/aikit';
910
import { handleK8sError } from '../lib/k8s-errors';
1011
import models from '../data/models.json';
1112
import logger from '../lib/logger';
@@ -285,6 +286,42 @@ const createDeploymentSchema = z.object({
285286
}
286287
});
287288

289+
/**
 * Fills in the container image (and, for direct GGUF runs, the model
 * download URL) on a KAITO deployment config before it is used.
 *
 * Resolution rules, in order:
 * - Non-KAITO providers: the config is returned untouched.
 * - `modelSource: 'premade'` with a `premadeModel`: an already-set `imageRef`
 *   is kept; otherwise the image is looked up via `aikitService.getImageRef`.
 *   If that lookup yields nothing, the config is returned untouched.
 * - `modelSource: 'huggingface'` with `ggufRunMode: 'direct'`: `imageRef`
 *   falls back to `GGUF_RUNNER_IMAGE` when unset/empty, and when `ggufFile`
 *   is present `engineArgs.ggufUrl` is set from
 *   `aikitService.buildHuggingFaceUrl(modelId, ggufFile)` (replacing any
 *   `ggufUrl` already in `engineArgs`, other args are preserved).
 * - Anything else: the config is returned untouched.
 *
 * The input object is never mutated; a shallow copy is returned whenever a
 * field is changed.
 *
 * @param config - Deployment config assembled from the validated request body.
 * @returns The same config object when nothing needs resolving, otherwise a
 *   shallow copy with `imageRef` (and possibly `engineArgs`) filled in.
 */
function resolveDeploymentImages(config: DeploymentConfig): DeploymentConfig {
  // Image resolution only applies to the KAITO provider.
  if (config.provider !== 'kaito') {
    return config;
  }

  const { modelSource, premadeModel, ggufRunMode, ggufFile } = config;

  if (modelSource === 'premade' && premadeModel) {
    // A caller-supplied image reference always takes precedence.
    if (config.imageRef) {
      return config;
    }

    const premadeImage = aikitService.getImageRef({
      modelSource: 'premade',
      premadeModel,
    });
    if (!premadeImage) {
      return config;
    }
    return { ...config, imageRef: premadeImage };
  }

  const isDirectGguf = modelSource === 'huggingface' && ggufRunMode === 'direct';
  if (!isDirectGguf) {
    return config;
  }

  // Direct GGUF runs use the shared runner image unless one was provided.
  const withRunnerImage: DeploymentConfig = {
    ...config,
    imageRef: config.imageRef || GGUF_RUNNER_IMAGE,
  };

  // Without a GGUF file name there is no URL to derive; keep engineArgs as-is.
  if (!ggufFile) {
    return withRunnerImage;
  }

  return {
    ...withRunnerImage,
    engineArgs: {
      ...(config.engineArgs || {}),
      ggufUrl: aikitService.buildHuggingFaceUrl(config.modelId, ggufFile),
    },
  };
}
324+
288325
const deployments = new Hono()
289326
.get('/', zValidator('query', listDeploymentsQuerySchema), async (c) => {
290327
try {
@@ -323,10 +360,10 @@ const deployments = new Hono()
323360
.post('/', zValidator('json', createDeploymentSchema), async (c) => {
324361
const body = c.req.valid('json');
325362

326-
const config: DeploymentConfig = {
363+
const config = resolveDeploymentImages({
327364
...body,
328365
namespace: body.namespace || (await configService.getDefaultNamespace()),
329-
};
366+
});
330367

331368
// GPU fit validation
332369
let gpuWarnings: string[] = [];
@@ -383,10 +420,10 @@ const deployments = new Hono()
383420
})
384421
.post('/preview', zValidator('json', createDeploymentSchema), async (c) => {
385422
const body = c.req.valid('json');
386-
const config: DeploymentConfig = {
423+
const config = resolveDeploymentImages({
387424
...body,
388425
namespace: body.namespace || (await configService.getDefaultNamespace()),
389-
};
426+
});
390427

391428
// Apply storage defaults that the mutating webhook would add,
392429
// so the preview manifest matches what Kubernetes will persist.

0 commit comments

Comments
 (0)