Skip to content

Commit aa14a55

Browse files
fix(kaito): detect KAITO installed via AKS AI-toolchain add-on (#322)
Signed-off-by: Suraj Deshmukh <suraj.deshmukh@microsoft.com>
Co-authored-by: Robbie Cronin <robert.owen.cronin@gmail.com>
1 parent c4ee1c5 commit aa14a55

5 files changed

Lines changed: 148 additions & 5 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -63,6 +63,7 @@ providers/*/bin/
6363
/test-results/
6464
/playwright-report/
6565
/playwright/.cache/
66+
.playwright-mcp/
6667
frontend/test-results/
6768
frontend/playwright-report/
6869
frontend/blob-report/

backend/src/services/kubernetes-runtime-status.test.ts

Lines changed: 27 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -378,6 +378,33 @@ describe('KubernetesService - Runtime Status', () => {
378378
expect(status.message).toBe('KAITO workspace CRD found and KAITO operator pods are ready');
379379
});
380380

381+
test('reports KAITO as installed when the AKS AI-toolchain-operator add-on pod is running in kube-system', async () => {
382+
restores.push(
383+
mockServiceMethod(kubernetesService, 'checkCRDExists', async (crdName: string) => crdName === 'workspaces.kaito.sh'),
384+
);
385+
// The AKS add-on runs the KAITO operator in kube-system, labeled
386+
// app=ai-toolchain-operator rather than the upstream Helm chart labels, so
387+
// it only surfaces through the cross-namespace fallback search.
388+
mockOperatorPods('kaito-workspace', kaitoOperatorSelector, [], [
389+
{
390+
metadata: { namespace: 'kube-system', name: 'kaito-workspace-557dbc5ffb-smczp', labels: { app: 'ai-toolchain-operator' } },
391+
status: {
392+
phase: 'Running',
393+
containerStatuses: [
394+
{ ready: true, restartCount: 0 },
395+
],
396+
},
397+
},
398+
]);
399+
400+
const status = await kubernetesService.checkKaitoInstallationStatus();
401+
402+
expect(status.installed).toBe(true);
403+
expect(status.crdFound).toBe(true);
404+
expect(status.operatorRunning).toBe(true);
405+
expect(status.message).toBe('KAITO workspace CRD found and KAITO operator pods are ready in kube-system');
406+
});
407+
381408
test('reports Dynamo as installed when a ready operator pod is found', async () => {
382409
restores.push(
383410
mockServiceMethod(kubernetesService, 'checkCRDExists', async (crdName: string) => crdName === 'dynamographdeployments.nvidia.com'),

backend/src/services/kubernetes.ts

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -31,6 +31,17 @@ const INFERENCE_EXTENSION_VERSION_ANNOTATIONS = [
3131
const KAITO_WORKSPACE_CRD = 'workspaces.kaito.sh';
3232
const KAITO_NAMESPACE = 'kaito-workspace';
3333
const KAITO_OPERATOR_POD_SELECTOR = 'app.kubernetes.io/name=workspace,app.kubernetes.io/instance=kaito-workspace';
34+
// The AKS AI-toolchain-operator add-on installs KAITO in kube-system. As verified
35+
// against a live `--enable-ai-toolchain-operator` cluster, the add-on operator
36+
// POD carries ONLY the bare `app=ai-toolchain-operator` label — it does NOT
37+
// carry `app.kubernetes.io/name` (that key is present on the Deployment but not
38+
// propagated to the pod template). So this pod probe must match on `app`; using
39+
// `app.kubernetes.io/name=ai-toolchain-operator` here would match nothing.
40+
// NOTE: the Go provider shim probes the Deployment instead and intentionally
41+
// uses `app.kubernetes.io/name=ai-toolchain-operator` — see
42+
// providers/kaito/upstream_health.go (listWorkspaceController). The two paths
43+
// key off different labels on purpose because they inspect different objects.
44+
const KAITO_AKS_ADDON_POD_SELECTOR = 'app=ai-toolchain-operator';
3445
const DYNAMO_CRD = 'dynamographdeployments.nvidia.com';
3546
const DYNAMO_NAMESPACE = 'dynamo-system';
3647
const DYNAMO_OPERATOR_POD_SELECTOR = 'control-plane=controller-manager,app.kubernetes.io/name=dynamo-operator,app.kubernetes.io/instance=dynamo-platform';
@@ -156,6 +167,14 @@ const RUNTIME_INSTALLATION_PROBES: Record<string, RuntimeInstallationProbe> = {
156167
operatorNamespace: KAITO_NAMESPACE,
157168
operatorPodSelectors: [KAITO_OPERATOR_POD_SELECTOR],
158169
fallbackPodSelectors: ['app.kubernetes.io/name=workspace'],
170+
// The AKS add-on pod only ever lives in kube-system, never in
171+
// kaito-workspace, so it is matched exclusively in the cross-namespace
172+
// pass. Listing it explicitly here (rather than relying on the implicit
173+
// `crossNamespaceFallbackPodSelectors = fallbackPodSelectors` default)
174+
// keeps add-on detection working even if KAITO later gains other
175+
// same-namespace fallbacks, and avoids a guaranteed-empty query for
176+
// `app=ai-toolchain-operator` against kaito-workspace on every probe.
177+
crossNamespaceFallbackPodSelectors: ['app.kubernetes.io/name=workspace', KAITO_AKS_ADDON_POD_SELECTOR],
159178
},
160179
dynamo: {
161180
providerName: 'Dynamo',

providers/kaito/upstream_health.go

Lines changed: 40 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,9 @@ import (
1717

1818
appsv1 "k8s.io/api/apps/v1"
1919
"k8s.io/apimachinery/pkg/api/meta"
20+
"k8s.io/apimachinery/pkg/labels"
2021
"k8s.io/apimachinery/pkg/runtime/schema"
22+
"k8s.io/apimachinery/pkg/selection"
2123
"sigs.k8s.io/controller-runtime/pkg/client"
2224
)
2325

@@ -43,9 +45,23 @@ const (
4345

4446
// Well-known resource selectors the probe looks up.
4547
const (
46-
kaitoDeploymentSelectorKey = "app.kubernetes.io/name"
47-
kaitoDeploymentSelectorValue = "workspace"
48-
controllerMissingUserMessage = "The KAITO workspace controller is not running. Install KAITO with `helm install kaito-workspace kaito/workspace`. If this cluster was provisioned with `--enable-ai-toolchain-operator`, disable the AKS extension first (`az aks update --disable-ai-toolchain-operator ...`)."
48+
kaitoDeploymentSelectorKey = "app.kubernetes.io/name"
49+
kaitoDeploymentSelectorValue = "workspace"
50+
// aksAddonSelectorValue matches the KAITO controller Deployment installed by
51+
// the AKS AI-toolchain-operator add-on. As verified against a live
52+
// `--enable-ai-toolchain-operator` cluster, the add-on Deployment carries
53+
// BOTH app.kubernetes.io/name=ai-toolchain-operator AND app=ai-toolchain-operator
54+
// (in kube-system), so probing the dotted key here is correct.
55+
// NOTE: the add-on POD only carries the bare `app` label, so the TypeScript
56+
// pod probe in backend/src/services/kubernetes.ts intentionally matches
57+
// `app=ai-toolchain-operator` instead. The two paths use different label
58+
// keys on purpose because they inspect different objects (Deployment here,
59+
// Pod there).
60+
aksAddonSelectorValue = "ai-toolchain-operator"
61+
// controllerMissingUserMessage covers both the "never installed" case and the
62+
// "add-on enabled but unhealthy" case, pointing at the namespace to inspect
63+
// for each install path.
64+
controllerMissingUserMessage = "The KAITO workspace controller is not running. Install it with `helm install kaito-workspace kaito/workspace` (check the kaito-workspace namespace), or via the AKS AI toolchain operator add-on `az aks update --enable-ai-toolchain-operator ...` (check the kube-system namespace)."
4965
controllerNotReadyUserMessage = "The KAITO workspace controller Deployment %s/%s exists but has no ready replicas."
5066
crdMissingUserMessage = "KAITO Workspace CRD not found. Install KAITO."
5167
)
@@ -132,16 +148,35 @@ func isNoKindMatch(err error) bool {
132148
// workspace controller label selector. It also returns a second return value
133149
// indicating whether any Deployment with the selector was found (so callers
134150
// can distinguish "missing" from "not ready").
151+
//
152+
// The selector matches both the upstream Helm chart
153+
// (app.kubernetes.io/name=workspace) and the AKS AI-toolchain-operator add-on
154+
// (app.kubernetes.io/name=ai-toolchain-operator). The List is cluster-wide so
155+
// the controller is found regardless of which namespace it runs in
156+
// (kaito-workspace for the chart, kube-system for the add-on).
135157
func listWorkspaceController(ctx context.Context, direct client.Client) (*appsv1.Deployment, bool, error) {
158+
req, err := labels.NewRequirement(
159+
kaitoDeploymentSelectorKey,
160+
selection.In,
161+
[]string{kaitoDeploymentSelectorValue, aksAddonSelectorValue},
162+
)
163+
if err != nil {
164+
return nil, false, fmt.Errorf("build controller selector: %w", err)
165+
}
166+
selector := labels.NewSelector().Add(*req)
167+
136168
list := &appsv1.DeploymentList{}
137-
if err := direct.List(ctx, list, client.MatchingLabels{kaitoDeploymentSelectorKey: kaitoDeploymentSelectorValue}); err != nil {
169+
if err := direct.List(ctx, list, client.MatchingLabelsSelector{Selector: selector}); err != nil {
138170
return nil, false, fmt.Errorf("list deployments: %w", err)
139171
}
140172
if len(list.Items) == 0 {
141173
return nil, false, nil
142174
}
143175
// Prefer a ready one; otherwise return the first item so the caller can
144-
// reference the namespace/name in the message.
176+
// reference the namespace/name in the message. When both the Helm chart and
177+
// the AKS add-on are present, the In selector returns both Deployments and
178+
// this loop reports the first ready one — installed/healthy is what matters,
179+
// not which install path wins the tiebreak.
145180
for i := range list.Items {
146181
d := &list.Items[i]
147182
if d.Status.ReadyReplicas > 0 {

providers/kaito/upstream_health_test.go

Lines changed: 61 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -123,6 +123,26 @@ func newKaitoDeployment(namespace, name string, readyReplicas int32) *appsv1.Dep
123123
}
124124
}
125125

126+
// newAKSAddonDeployment builds a Deployment that mimics the KAITO controller
127+
// installed by the AKS AI-toolchain-operator add-on. The label set mirrors what
128+
// a live `--enable-ai-toolchain-operator` cluster emits: the Deployment carries
129+
// BOTH app.kubernetes.io/name=ai-toolchain-operator and app=ai-toolchain-operator
130+
// (typically in kube-system), unlike the upstream Helm chart's
131+
// app.kubernetes.io/name=workspace. The probe matches on the dotted key.
132+
func newAKSAddonDeployment(namespace, name string, readyReplicas int32) *appsv1.Deployment {
133+
return &appsv1.Deployment{
134+
ObjectMeta: metav1.ObjectMeta{
135+
Name: name,
136+
Namespace: namespace,
137+
Labels: map[string]string{
138+
kaitoDeploymentSelectorKey: aksAddonSelectorValue,
139+
"app": aksAddonSelectorValue,
140+
},
141+
},
142+
Status: appsv1.DeploymentStatus{ReadyReplicas: readyReplicas},
143+
}
144+
}
145+
126146
func stringContains(s, substr string) bool {
127147
for i := 0; i+len(substr) <= len(s); i++ {
128148
if s[i:i+len(substr)] == substr {
@@ -161,6 +181,47 @@ func TestProbe_ControllerReady(t *testing.T) {
161181
}
162182
}
163183

184+
func TestProbe_ControllerReady_AKSAddon(t *testing.T) {
185+
// KAITO installed via the AKS AI-toolchain-operator add-on: the controller
186+
// runs in kube-system with app.kubernetes.io/name=ai-toolchain-operator.
187+
// The probe must recognise it as a healthy upstream controller.
188+
d := newAKSAddonDeployment("kube-system", "kaito-workspace", 1)
189+
c := probeClientBuilderWithWorkspace(t).
190+
WithObjects(d).
191+
Build()
192+
193+
got := probeUpstreamController(context.Background(), c)
194+
195+
if !got.Healthy {
196+
t.Errorf("expected Healthy=true, got %+v", got)
197+
}
198+
if got.Reason != ReasonUpstreamHealthy {
199+
t.Errorf("expected Reason=%s, got %s", ReasonUpstreamHealthy, got.Reason)
200+
}
201+
}
202+
203+
func TestProbe_ControllerNotReady_AKSAddon(t *testing.T) {
204+
// The add-on Deployment exists in kube-system but has no ready replicas:
205+
// the probe must report NotReady (not Missing), referencing its location.
206+
d := newAKSAddonDeployment("kube-system", "kaito-workspace", 0)
207+
c := probeClientBuilderWithWorkspace(t).
208+
WithObjects(d).
209+
Build()
210+
211+
got := probeUpstreamController(context.Background(), c)
212+
213+
if got.Healthy {
214+
t.Error("expected Healthy=false")
215+
}
216+
if got.Reason != ReasonUpstreamControllerNotReady {
217+
t.Errorf("expected Reason=%s, got %s", ReasonUpstreamControllerNotReady, got.Reason)
218+
}
219+
want := "kube-system/kaito-workspace"
220+
if !stringContains(got.Message, want) {
221+
t.Errorf("expected Message to contain %q, got %q", want, got.Message)
222+
}
223+
}
224+
164225
func TestProbe_ControllerNotReady(t *testing.T) {
165226
d := newKaitoDeployment("kaito-workspace", "kaito-workspace", 0)
166227
c := probeClientBuilderWithWorkspace(t).

0 commit comments

Comments
 (0)