Skip to content

Commit 8f3386a

Browse files
authored
feat: add MaaS (Models as a Service) Helm chart and integration (#61)
* feat: add MaaS (Models as a Service) Helm chart and integration Add charts/maas — a Helm chart for deploying the MaaS platform on xKS, providing API key management, tiered subscriptions, and authenticated model serving via Kuadrant/RHCL gateway policies. Components: - maas-controller: watches MaaSSubscription/MaaSAuthPolicy CRs, auto-generates per-model AuthPolicy and TokenRateLimitPolicy - maas-api: REST API for API keys, subscriptions, model discovery - PostgreSQL: persistent storage (PVC-backed by default) - Gateway + HTTPRoute: Istio-based ingress for /v1/models and /maas-api - AuthPolicy: API key callback + optional Azure AD JWT authentication - RateLimitPolicy: configurable request-based rate limiting - 4 CRDs: ExternalModel, MaaSAuthPolicy, MaaSModelRef, MaaSSubscription Security: - No anonymous access — API key or Azure AD JWT required - Scoped RBAC (namespace Role for secrets, ClusterRole read-only) - PostgreSQL credentials auto-generated and persisted in Secrets - Azure AD tenantId/clientId validated at template render time - NetworkPolicy allows only Authorino, Istio gateway, and sidecar traffic - Container security contexts with dropped capabilities Integration: - Helmfile orchestration with presync (CRD apply, dependency checks) and postsync (rollout validation, gateway SA patching) hooks - MAAS=true requires RHCL=true (Makefile guard) - Kuadrant CR readiness check before deploy - make deploy-all RHCL=true MAAS=true for full stack Made-with: Cursor * fix: MaaS chart fixes for xKS deployment - Add --force-conflicts to CRD server-side apply to fix ownership conflicts on redeployment - Fix maas-api AuthPolicy: replace broken api-key callback with anonymous auth when Azure AD is disabled (per-model auth is handled by maas-controller-generated AuthPolicies) - Add fsGroup: 26 to PostgreSQL pod security context for file permission issues on xKS - Add demo Makefile targets (demo-setup, demo-run, demo-cleanup) Made-with: Cursor * fix: MaaS TLS + HTTPS gateway 
+ NetworkPolicy alignment - Enable TLS for maas-api via cert-manager (opendatahub-ca-issuer) - Add maas-api certificate.yaml and destination-rule.yaml templates - Fix maas-api deployment env vars (TLS_CERT/TLS_KEY, TLS_SELF_SIGNED=false) - Enable HTTPS listener on gateway (port 443) with gateway-certificate.yaml - Fix NetworkPolicy to use port 8443 when TLS is enabled (was hardcoded 8080) - Fix maas-api-auth policy headers for anonymous auth (X-MaaS-Group/Username) - Document existingSecret path for PostgreSQL in secret.yaml - Extend setup-maas-tls.sh: Authorino CA trust + gateway CA bundle mount Made-with: Cursor
1 parent 5392474 commit 8f3386a

38 files changed

Lines changed: 2679 additions & 12 deletions

Makefile

Lines changed: 120 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,15 @@
11
.PHONY: deploy deploy-all undeploy undeploy-kserve status help check-kubeconfig sync clear-cache
22
.PHONY: deploy-cert-manager deploy-istio deploy-lws deploy-rhcl deploy-kserve deploy-opendatahub-prerequisites deploy-cert-manager-pki
3-
.PHONY: undeploy-rhcl test conformance deploy-mock-model clean-mock-model
3+
.PHONY: undeploy-rhcl deploy-maas undeploy-maas test conformance deploy-mock-model clean-mock-model lint
4+
.PHONY: demo-setup demo-run demo-cleanup demo-all setup-maas-tls
45

56
HELMFILE_CACHE := $(HOME)/.cache/helmfile
67
# Auto-detect KServe namespace: redhat-ods-applications (EA2) or opendatahub (EA1)
78
# NOTE(review): `?=` creates a recursively expanded variable, so the kubectl probe below
# is re-run every time $(KSERVE_NAMESPACE) is expanded unless the caller supplies a value
# (e.g. `make status KSERVE_NAMESPACE=opendatahub`).
KSERVE_NAMESPACE ?= $(shell kubectl get deployment llmisvc-controller-manager -n redhat-ods-applications -o name 2>/dev/null | grep -q . && echo redhat-ods-applications || echo opendatahub)
89
RHCL ?= false
10+
# MAAS=true adds deploy-maas to the deploy-all prerequisites (see MAAS_TARGET).
MAAS ?= false
11+
# Name and namespace of the Gateway that `make status` looks up for the MaaS section.
MAAS_GATEWAY_NAME ?= maas-default-gateway
12+
MAAS_GATEWAY_NS ?= istio-system
913

1014
check-kubeconfig:
1115
@kubectl cluster-info >/dev/null 2>&1 || (echo "ERROR: Cannot connect to cluster. Check KUBECONFIG." && exit 1)
@@ -17,20 +21,31 @@ help:
1721
@echo " make deploy - Deploy cert-manager + istio + lws"
1822
@echo " make deploy-all - Deploy all (cert-manager + istio + lws + kserve)"
1923
@echo " make deploy-all RHCL=true - Deploy all including RHCL"
24+
@echo " make deploy-all RHCL=true MAAS=true - Deploy all including RHCL + MaaS"
2025
@echo " make deploy-rhcl - Deploy RHCL standalone (API gateway, auth, rate limiting)"
26+
@echo " make deploy-maas - Deploy MaaS (API gateway, auth, rate limiting, subscriptions)"
27+
@echo " make setup-maas-tls - Setup TLS cert chain for MaaS (Authorino CA trust)"
2128
@echo " make deploy-kserve - Deploy KServe"
2229
@echo ""
2330
@echo "Undeploy:"
2431
@echo " make undeploy - Remove all infrastructure"
2532
@echo " make undeploy-rhcl - Remove RHCL"
33+
@echo " make undeploy-maas - Remove MaaS"
2634
@echo " make undeploy-kserve - Remove KServe"
2735
@echo ""
2836
@echo "Mock model (no GPU):"
2937
@echo " make deploy-mock-model - Deploy mock LLMInferenceService"
3038
@echo " make clean-mock-model - Clean up mock deployment"
3139
@echo ""
40+
@echo "Demo (full end-to-end):"
41+
@echo " make demo-setup - Deploy mock model + MaaS resources"
42+
@echo " make demo-run - Run interactive demo (auth, rate limiting, inference)"
43+
@echo " make demo-cleanup - Remove demo resources (keeps stack)"
44+
@echo " make demo-all - Setup + run in one shot"
45+
@echo ""
3246
@echo "Other:"
3347
@echo " make status - Show deployment status"
48+
@echo " make lint - Run local lints (helm lint, yamllint, shellcheck)"
3449
@echo " make test - Run ODH conformance tests"
3550
@echo " make sync - Fetch latest from git repos"
3651
@echo " make clear-cache - Clear helmfile git cache"
@@ -52,11 +67,30 @@ deploy: check-kubeconfig clear-cache
5267
@$(MAKE) status
5368

5469
RHCL_TARGET := $(if $(filter true,$(RHCL)),deploy-rhcl,)
70+
MAAS_TARGET := $(if $(filter true,$(MAAS)),deploy-maas,)
71+
72+
# MaaS requires RHCL — fail early if MAAS=true without RHCL=true
73+
# Parse-time guard. Tests for the absence of "true" in RHCL rather than the presence of
# some other word, so an empty RHCL (e.g. `make deploy-all MAAS=true RHCL=`) is rejected too.
$(if $(filter true,$(MAAS)),$(if $(filter true,$(RHCL)),,$(error MAAS=true requires RHCL=true. Run: make deploy-all RHCL=true MAAS=true)))
5574

56-
deploy-all: check-kubeconfig deploy-cert-manager deploy-istio deploy-lws $(RHCL_TARGET) deploy-kserve
75+
# Full-stack deploy. $(RHCL_TARGET) and $(MAAS_TARGET) expand to deploy-rhcl / deploy-maas
# only when RHCL=true / MAAS=true, and to nothing otherwise.
# NOTE(review): the component order is expressed only by the left-to-right prerequisite
# list; no inter-target dependencies are declared here, so `make -j` would not preserve it.
deploy-all: check-kubeconfig deploy-cert-manager deploy-istio deploy-lws $(RHCL_TARGET) deploy-kserve $(MAAS_TARGET)
5776
@$(MAKE) status
5877

5978
# Applies the cert-manager-operator helmfile release.
# Before applying, warns when pods exist in the cert-manager namespace but `helm list`
# shows no cert-manager release in cert-manager-operator. The shell reads
# CERT_MANAGER_FORCE_CLEANUP from its environment (default "false"); only when it is
# "true" are all Deployments and ServiceAccounts in the cert-manager namespace deleted.
deploy-cert-manager: check-kubeconfig clear-cache
79+
@# Detect orphaned cert-manager pods not managed by our Helm release
80+
@if kubectl get pods -n cert-manager --no-headers 2>/dev/null | grep -q .; then \
81+
if ! helm list -n cert-manager-operator 2>/dev/null | grep -q cert-manager; then \
82+
echo "WARNING: cert-manager pods exist in cert-manager namespace but no Helm release found in cert-manager-operator namespace."; \
83+
echo " To force cleanup, re-run with: CERT_MANAGER_FORCE_CLEANUP=true"; \
84+
if [ "$${CERT_MANAGER_FORCE_CLEANUP:-false}" = "true" ]; then \
85+
echo " Cleaning orphaned resources..."; \
86+
kubectl delete deployment --all -n cert-manager --ignore-not-found 2>/dev/null || true; \
87+
kubectl delete sa --all -n cert-manager --ignore-not-found 2>/dev/null || true; \
88+
echo " Done. Proceeding with fresh deploy."; \
89+
else \
90+
echo " Skipping cleanup (CERT_MANAGER_FORCE_CLEANUP=false)."; \
91+
fi; \
92+
fi; \
93+
fi
6094
helmfile apply --selector name=cert-manager-operator
6195

6296
deploy-istio: check-kubeconfig clear-cache
@@ -80,9 +114,35 @@ undeploy-rhcl: check-kubeconfig
80114
-helmfile destroy --selector name=rhcl --state-values-set rhclOperator.enabled=true
81115
@echo "=== RHCL removed ==="
82116

117+
# Deploys the MaaS helmfile release, then runs the TLS setup script.
# Fails fast unless these exist in the cluster: the LLMInferenceService CRD (KServe),
# the AuthPolicy CRD (Kuadrant/RHCL), the Gateway API Gateway CRD, and at least one
# Kuadrant instance in kuadrant-system.
# NOTE(review): the banner lists cert-manager as a prerequisite, but no cert-manager
# check is performed in this recipe.
deploy-maas: check-kubeconfig clear-cache
118+
@echo "=== Deploying MaaS (Models as a Service) ==="
119+
@echo "Prerequisites: KServe, RHCL, Istio, and cert-manager must be deployed first"
120+
@kubectl get crd llminferenceservices.serving.kserve.io >/dev/null 2>&1 || \
121+
(echo "ERROR: KServe not found. Run 'make deploy-kserve' first." && exit 1)
122+
@kubectl get crd authpolicies.kuadrant.io >/dev/null 2>&1 || \
123+
(echo "ERROR: Kuadrant/RHCL not found. Run 'make deploy-rhcl' first." && exit 1)
124+
@kubectl get crd gateways.gateway.networking.k8s.io >/dev/null 2>&1 || \
125+
(echo "ERROR: Gateway API CRDs not found. Run 'make deploy-istio' first." && exit 1)
126+
@kubectl get kuadrant -n kuadrant-system -o name 2>/dev/null | grep -q . || \
127+
(echo "ERROR: No Kuadrant instance found in kuadrant-system. Run 'make deploy-rhcl' first." && exit 1)
128+
helmfile apply --selector name=maas --state-values-set maas.enabled=true
129+
@echo "=== Running MaaS TLS setup (Authorino CA trust) ==="
130+
@./scripts/setup-maas-tls.sh
131+
@echo "=== MaaS deployed ==="
132+
133+
# Runs scripts/setup-maas-tls.sh on its own; deploy-maas invokes the same script as its
# final step.
setup-maas-tls: check-kubeconfig
134+
@echo "=== Setting up MaaS TLS (cert chain + Authorino CA trust) ==="
135+
@./scripts/setup-maas-tls.sh
136+
137+
# Destroys the MaaS helmfile release, then deletes the four MaaS CRDs. Both steps are
# best-effort: the `-` prefix (and `|| true` on the kubectl line) ignores failures.
# NOTE(review): deleting a CRD also removes every custom resource of that kind in the
# cluster — confirm this is acceptable for shared clusters.
undeploy-maas: check-kubeconfig
138+
@echo "=== Removing MaaS ==="
139+
-helmfile destroy --selector name=maas --state-values-set maas.enabled=true
140+
-kubectl delete crd maassubscriptions.maas.opendatahub.io maasmodelrefs.maas.opendatahub.io maasauthpolicies.maas.opendatahub.io externalmodels.maas.opendatahub.io --ignore-not-found 2>/dev/null || true
141+
@echo "=== MaaS removed ==="
142+
83143
deploy-opendatahub-prerequisites: check-kubeconfig
84144
@echo "=== Deploying OpenDataHub prerequisites ==="
85-
kubectl create namespace $(KSERVE_NAMESPACE) --dry-run=client -o yaml | kubectl apply -f -
145+
kubectl create namespace "$(KSERVE_NAMESPACE)" --dry-run=client -o yaml | kubectl apply -f -
86146
-kubectl get secret redhat-pull-secret -n istio-system -o yaml 2>/dev/null | \
87147
sed 's/namespace: istio-system/namespace: $(KSERVE_NAMESPACE)/' | \
88148
kubectl apply -f - 2>/dev/null || true
@@ -112,25 +172,28 @@ deploy-cert-manager-pki: check-kubeconfig deploy-opendatahub-prerequisites
112172

113173
# Syncs the kserve-rhaii-xks helmfile release and waits for it (`--wait`).
# NOTE(review): this change drops `--skip-crds` from the sync, presumably so the chart's
# CRDs are installed as part of the release — confirm.
deploy-kserve: check-kubeconfig deploy-cert-manager-pki
114174
@echo "Applying KServe via Helm..."
115-
helmfile sync --wait --selector name=kserve-rhaii-xks --skip-crds
175+
helmfile sync --wait --selector name=kserve-rhaii-xks
116176
@echo "=== KServe deployed ==="
117177

118-
# Undeploy
119-
undeploy: check-kubeconfig undeploy-kserve
178+
# Undeploy (reverse dependency order: MaaS → KServe → RHCL → base infra)
179+
# Tears down MaaS, KServe and RHCL via sub-makes, then runs scripts/cleanup.sh -y.
# Each sub-make has stderr discarded and its failure ignored (`-` prefix plus `|| true`),
# presumably so a component that was never installed does not abort the sequence.
undeploy: check-kubeconfig
180+
-@$(MAKE) undeploy-maas 2>/dev/null || true
181+
-@$(MAKE) undeploy-kserve 2>/dev/null || true
182+
-@$(MAKE) undeploy-rhcl 2>/dev/null || true
120183
@./scripts/cleanup.sh -y
121184

122185
# Best-effort KServe removal: every step ignores errors and discards stderr.
# Order: LLMInferenceService and InferencePool objects in all namespaces, the Helm release,
# the two validating webhook configurations, matching CRDs, cluster-scoped RBAC whose name
# contains "kserve", the opendatahub ClusterIssuers and CA Certificate, and finally the
# $(KSERVE_NAMESPACE) namespace (deleted without waiting).
undeploy-kserve: check-kubeconfig
123186
-@kubectl delete llminferenceservice --all -A --ignore-not-found 2>/dev/null || true
124187
-@kubectl delete inferencepool --all -A --ignore-not-found 2>/dev/null || true
125-
-@helm uninstall kserve-rhaii-xks --namespace $(KSERVE_NAMESPACE) 2>/dev/null || true
188+
-@helm uninstall kserve-rhaii-xks --namespace "$(KSERVE_NAMESPACE)" 2>/dev/null || true
126189
-@kubectl delete validatingwebhookconfiguration llminferenceservice.serving.kserve.io llminferenceserviceconfig.serving.kserve.io --ignore-not-found 2>/dev/null || true
127190
-@# Removes KServe CRDs and Inference Extension CRDs (Helm does not remove CRDs on uninstall)
128191
-@kubectl get crd -o name | grep -E "serving.kserve.io|inference.networking" | xargs -r kubectl delete --ignore-not-found 2>/dev/null || true
129192
-@# Removes cluster-scoped RBAC resources
130193
-@kubectl get clusterrole,clusterrolebinding -o name | grep -i kserve | xargs -r kubectl delete --ignore-not-found 2>/dev/null || true
131194
-@kubectl delete clusterissuer opendatahub-ca-issuer opendatahub-selfsigned-issuer --ignore-not-found 2>/dev/null || true
132195
-@kubectl delete certificate opendatahub-ca -n cert-manager --ignore-not-found 2>/dev/null || true
133-
-@kubectl delete namespace $(KSERVE_NAMESPACE) --ignore-not-found --wait=false 2>/dev/null || true
196+
-@kubectl delete namespace "$(KSERVE_NAMESPACE)" --ignore-not-found --wait=false 2>/dev/null || true
134197
@echo "=== KServe removed ==="
135198

136199
# Status
@@ -157,6 +220,15 @@ status: check-kubeconfig
157220
kubectl get kuadrant,authorino,limitador -n kuadrant-system 2>/dev/null || echo " No instances"; \
158221
fi
159222
@echo ""
223+
@echo "maas (optional):"
224+
@kubectl get pods -n "$(KSERVE_NAMESPACE)" -l control-plane=maas-controller 2>/dev/null || echo " Not deployed (optional component)"
225+
@kubectl get pods -n "$(KSERVE_NAMESPACE)" -l app.kubernetes.io/name=maas-api 2>/dev/null || echo " "
226+
@if kubectl get crd maassubscriptions.maas.opendatahub.io >/dev/null 2>&1; then \
227+
echo ""; \
228+
echo "maas gateway:"; \
229+
kubectl get gateway "$(MAAS_GATEWAY_NAME)" -n "$(MAAS_GATEWAY_NS)" 2>/dev/null || echo " No gateway"; \
230+
fi
231+
@echo ""
160232
@echo "kserve:"
161233
@kubectl get pods -n $(KSERVE_NAMESPACE) -l control-plane=kserve-controller-manager 2>/dev/null || echo " Not deployed"
162234
@echo ""
@@ -225,3 +297,43 @@ clean-mock-model: check-kubeconfig
225297
-kubectl delete clusterstoragecontainer local-noop --ignore-not-found 2>/dev/null || true
226298
-kubectl delete namespace "$(MOCK_NAMESPACE)" --ignore-not-found
227299
@echo "=== Done ==="
300+
301+
# Demo — end-to-end demo (requires full stack: make deploy-all RHCL=true MAAS=true)
302+
# Runs demo/setup.sh (the help text describes this as deploying the mock model plus
# MaaS resources).
demo-setup: check-kubeconfig
303+
@./demo/setup.sh
304+
305+
# Runs demo/run.sh (the help text describes this as the interactive demo).
demo-run: check-kubeconfig
306+
@./demo/run.sh
307+
308+
# Runs demo/cleanup.sh (the help text says demo resources are removed and the stack kept).
demo-cleanup: check-kubeconfig
309+
@./demo/cleanup.sh
310+
311+
# Aggregate target with no recipe: builds demo-setup, then demo-run.
demo-all: demo-setup demo-run
312+
313+
# Lint — run locally before pushing to catch issues CodeRabbit would flag
314+
# Runs three linters in one shell and exits with FAIL (0 = all passed, 1 = any failed):
#   - helm lint on every charts/*/ directory that contains a Chart.yaml
#   - yamllint (relaxed rules, line length 200) on values.yaml and charts/*/values.yaml
#   - shellcheck -x on scripts/*.sh and test/*.sh
# A missing yamllint or shellcheck is reported but does not set FAIL.
# NOTE(review): the demo/*.sh scripts invoked by the demo-* targets are not in the
# shellcheck glob — confirm whether that is intentional.
lint:
315+
@echo "=== Running local lints ==="
316+
@FAIL=0; \
317+
echo "--- helm lint ---"; \
318+
for chart in charts/*/; do \
319+
if [ -f "$$chart/Chart.yaml" ]; then \
320+
helm lint "$$chart" 2>&1 || FAIL=1; \
321+
fi; \
322+
done; \
323+
echo ""; \
324+
echo "--- yamllint ---"; \
325+
if command -v yamllint >/dev/null 2>&1; then \
326+
yamllint -d '{extends: relaxed, rules: {line-length: {max: 200}}}' values.yaml charts/*/values.yaml 2>&1 || FAIL=1; \
327+
else \
328+
echo " yamllint not found (brew install yamllint)"; \
329+
fi; \
330+
echo ""; \
331+
echo "--- shellcheck ---"; \
332+
if command -v shellcheck >/dev/null 2>&1; then \
333+
shellcheck -x scripts/*.sh test/*.sh 2>&1 || FAIL=1; \
334+
else \
335+
echo " shellcheck not found (brew install shellcheck)"; \
336+
fi; \
337+
echo ""; \
338+
echo "=== Lint complete ==="; \
339+
exit $$FAIL

charts/maas/Chart.yaml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
apiVersion: v2
2+
name: maas
3+
description: Models as a Service (MaaS) for xKS — API gateway authentication, authorization, rate limiting, and model subscription management
4+
version: 0.1.0
5+
appVersion: "0.0.2"
6+
type: application
7+
keywords:
8+
- maas
9+
- models-as-a-service
10+
- kuadrant
11+
- kserve
12+
- rate-limiting
13+
- authentication
14+
home: https://github.com/opendatahub-io/models-as-a-service
15+
sources:
16+
- https://github.com/opendatahub-io/models-as-a-service
Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
---
2+
apiVersion: apiextensions.k8s.io/v1
3+
kind: CustomResourceDefinition
4+
metadata:
5+
annotations:
6+
controller-gen.kubebuilder.io/version: v0.16.4
7+
name: externalmodels.maas.opendatahub.io
8+
spec:
9+
group: maas.opendatahub.io
10+
names:
11+
kind: ExternalModel
12+
listKind: ExternalModelList
13+
plural: externalmodels
14+
singular: externalmodel
15+
scope: Namespaced
16+
versions:
17+
- additionalPrinterColumns:
18+
- jsonPath: .spec.provider
19+
name: Provider
20+
type: string
21+
- jsonPath: .spec.endpoint
22+
name: Endpoint
23+
type: string
24+
- jsonPath: .status.phase
25+
name: Phase
26+
type: string
27+
- jsonPath: .metadata.creationTimestamp
28+
name: Age
29+
type: date
30+
name: v1alpha1
31+
schema:
32+
openAPIV3Schema:
33+
description: |-
34+
ExternalModel is the Schema for the externalmodels API.
35+
It defines an external LLM provider (e.g., OpenAI, Anthropic) that can be
36+
referenced by MaaSModelRef resources.
37+
properties:
38+
apiVersion:
39+
description: |-
40+
APIVersion defines the versioned schema of this representation of an object.
41+
Servers should convert recognized schemas to the latest internal value, and
42+
may reject unrecognized values.
43+
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
44+
type: string
45+
kind:
46+
description: |-
47+
Kind is a string value representing the REST resource this object represents.
48+
Servers may infer this from the endpoint the client submits requests to.
49+
Cannot be updated.
50+
In CamelCase.
51+
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
52+
type: string
53+
metadata:
54+
type: object
55+
spec:
56+
description: ExternalModelSpec defines the desired state of ExternalModel
57+
properties:
58+
credentialRef:
59+
description: |-
60+
CredentialRef references a Kubernetes Secret containing the provider API key.
61+
The Secret must contain a data key "api-key" with the credential value.
62+
properties:
63+
name:
64+
description: Name is the name of the Secret
65+
maxLength: 253
66+
minLength: 1
67+
type: string
68+
required:
69+
- name
70+
type: object
71+
endpoint:
72+
description: |-
73+
Endpoint is the FQDN of the external provider (no scheme or path).
74+
e.g. "api.openai.com".
75+
This field is metadata for downstream consumers (e.g. BBR provider-resolver plugin)
76+
and is not used by the controller for endpoint derivation.
77+
maxLength: 253
78+
pattern: ^[a-zA-Z0-9]([a-zA-Z0-9\-]*[a-zA-Z0-9])?(\.[a-zA-Z0-9]([a-zA-Z0-9\-]*[a-zA-Z0-9])?)*$
79+
type: string
80+
provider:
81+
description: |-
82+
Provider identifies the API format and auth type for the external model.
83+
e.g. "openai", "anthropic".
84+
maxLength: 63
85+
type: string
86+
required:
87+
- credentialRef
88+
- endpoint
89+
- provider
90+
type: object
91+
status:
92+
description: ExternalModelStatus defines the observed state of ExternalModel
93+
properties:
94+
conditions:
95+
description: Conditions represent the latest available observations
96+
of the external model's state
97+
items:
98+
description: Condition contains details for one aspect of the current
99+
state of this API Resource.
100+
properties:
101+
lastTransitionTime:
102+
description: |-
103+
lastTransitionTime is the last time the condition transitioned from one status to another.
104+
This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable.
105+
format: date-time
106+
type: string
107+
message:
108+
description: |-
109+
message is a human readable message indicating details about the transition.
110+
This may be an empty string.
111+
maxLength: 32768
112+
type: string
113+
observedGeneration:
114+
description: |-
115+
observedGeneration represents the .metadata.generation that the condition was set based upon.
116+
For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date
117+
with respect to the current state of the instance.
118+
format: int64
119+
minimum: 0
120+
type: integer
121+
reason:
122+
description: |-
123+
reason contains a programmatic identifier indicating the reason for the condition's last transition.
124+
Producers of specific condition types may define expected values and meanings for this field,
125+
and whether the values are considered a guaranteed API.
126+
The value should be a CamelCase string.
127+
This field may not be empty.
128+
maxLength: 1024
129+
minLength: 1
130+
pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$
131+
type: string
132+
status:
133+
description: status of the condition, one of True, False, Unknown.
134+
enum:
135+
- "True"
136+
- "False"
137+
- Unknown
138+
type: string
139+
type:
140+
description: type of condition in CamelCase or in foo.example.com/CamelCase.
141+
maxLength: 316
142+
pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$
143+
type: string
144+
required:
145+
- lastTransitionTime
146+
- message
147+
- reason
148+
- status
149+
- type
150+
type: object
151+
type: array
152+
phase:
153+
description: Phase represents the current phase of the external model
154+
enum:
155+
- Pending
156+
- Ready
157+
- Failed
158+
type: string
159+
type: object
160+
type: object
161+
served: true
162+
storage: true
163+
subresources:
164+
status: {}

0 commit comments

Comments
 (0)