Skip to content

Commit c6e3181

Browse files
Brent Salisburynerdalert
authored andcommitted
feat: add tlsInsecureSkipVerify to ExternalModel spec
Adds an optional spec.tlsInsecureSkipVerify field to the ExternalModel CRD. When true, the reconciler generates the DestinationRule with insecureSkipVerify: true, allowing connections to endpoints with self-signed certificates without manual patching that gets overwritten on reconciliation. Default is false. Closes #627
1 parent 6d31fd8 commit c6e3181

9 files changed

Lines changed: 120 additions & 22 deletions

File tree

deployment/base/maas-controller/crd/bases/maas.opendatahub.io_externalmodels.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,18 @@ spec:
101101
maxLength: 253
102102
minLength: 1
103103
type: string
104+
tlsInsecureSkipVerify:
105+
description: |-
106+
TLSInsecureSkipVerify disables TLS certificate verification on the
107+
DestinationRule created for this external model. When true, the generated
108+
Istio DestinationRule includes insecureSkipVerify: true under
109+
trafficPolicy.tls, allowing connections to endpoints with self-signed or
110+
untrusted certificates (e.g., simulators, dev environments).
111+
112+
WARNING: Do not enable in production. This is intended for testing only.
113+
114+
Default: false (certificates are verified).
115+
type: boolean
104116
required:
105117
- credentialRef
106118
- endpoint

maas-api/cmd/main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ func registerHandlers(ctx context.Context, log *logger.Logger, router *gin.Engin
148148

149149
subscriptionSelector := subscription.NewSelector(log, cluster.MaaSSubscriptionLister)
150150

151-
modelManager, err := models.NewManager(log)
151+
modelManager, err := models.NewManager(log, cfg.AccessCheckTimeoutSeconds)
152152
if err != nil {
153153
log.Fatal("Failed to create model manager", "error", err)
154154
}

maas-api/internal/config/config.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,12 @@ type Config struct {
4646
// Default: 30 days. Minimum: 1 day.
4747
APIKeyMaxExpirationDays int
4848

49+
// AccessCheckTimeoutSeconds bounds the total duration of model access validation.
50+
// This limits the staleness window between when access is checked and when the
51+
// response reaches the client. Models whose probes don't complete within this
52+
// window are excluded (fail-closed). Default: 15 seconds. Minimum: 1 second.
53+
AccessCheckTimeoutSeconds int
54+
4955
// Deprecated flag (backward compatibility with pre-TLS version)
5056
deprecatedHTTPPort string
5157
}
@@ -56,6 +62,7 @@ func Load() *Config {
5662
gatewayName := env.GetString("GATEWAY_NAME", constant.DefaultGatewayName)
5763
secure, _ := env.GetBool("SECURE", false)
5864
maxExpirationDays, _ := env.GetInt("API_KEY_MAX_EXPIRATION_DAYS", constant.DefaultAPIKeyMaxExpirationDays)
65+
accessCheckTimeoutSeconds, _ := env.GetInt("ACCESS_CHECK_TIMEOUT_SECONDS", 15)
5966

6067
c := &Config{
6168
Name: env.GetString("INSTANCE_NAME", gatewayName),
@@ -69,6 +76,7 @@ func Load() *Config {
6976
DebugMode: debugMode,
7077
DBConnectionURL: "", // Loaded from K8s secret via LoadDatabaseURL()
7178
APIKeyMaxExpirationDays: maxExpirationDays,
79+
AccessCheckTimeoutSeconds: accessCheckTimeoutSeconds,
7280
// Deprecated env var (backward compatibility with pre-TLS version)
7381
deprecatedHTTPPort: env.GetString("PORT", ""),
7482
}
@@ -141,6 +149,10 @@ func (c *Config) Validate() error {
141149
return errors.New("API_KEY_MAX_EXPIRATION_DAYS must be at least 1")
142150
}
143151

152+
if c.AccessCheckTimeoutSeconds < 1 {
153+
return errors.New("ACCESS_CHECK_TIMEOUT_SECONDS must be at least 1")
154+
}
155+
144156
return nil
145157
}
146158

maas-api/internal/config/config_test.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ func TestValidate(t *testing.T) {
120120
DBConnectionURL: "postgresql://localhost/test",
121121
Secure: false,
122122
APIKeyMaxExpirationDays: 30,
123+
AccessCheckTimeoutSeconds: 15,
123124
MaaSSubscriptionNamespace: "models-as-a-service",
124125
},
125126
},
@@ -129,6 +130,7 @@ func TestValidate(t *testing.T) {
129130
DBConnectionURL: "postgresql://localhost/test",
130131
TLS: TLSConfig{SelfSigned: true, MinVersion: TLSVersion(tls.VersionTLS12)},
131132
APIKeyMaxExpirationDays: 30,
133+
AccessCheckTimeoutSeconds: 15,
132134
MaaSSubscriptionNamespace: "models-as-a-service",
133135
},
134136
},
@@ -138,6 +140,7 @@ func TestValidate(t *testing.T) {
138140
DBConnectionURL: "postgresql://localhost/test",
139141
TLS: TLSConfig{Cert: "/cert.pem", Key: "/key.pem", MinVersion: TLSVersion(tls.VersionTLS12)},
140142
APIKeyMaxExpirationDays: 30,
143+
AccessCheckTimeoutSeconds: 15,
141144
MaaSSubscriptionNamespace: "models-as-a-service",
142145
},
143146
},
@@ -146,6 +149,7 @@ func TestValidate(t *testing.T) {
146149
cfg: Config{
147150
DBConnectionURL: "postgresql://localhost/test",
148151
APIKeyMaxExpirationDays: 1,
152+
AccessCheckTimeoutSeconds: 15,
149153
MaaSSubscriptionNamespace: "models-as-a-service",
150154
},
151155
},
@@ -154,6 +158,7 @@ func TestValidate(t *testing.T) {
154158
cfg: Config{
155159
DBConnectionURL: "postgresql://localhost/test",
156160
APIKeyMaxExpirationDays: 30,
161+
AccessCheckTimeoutSeconds: 15,
157162
MaaSSubscriptionNamespace: "models-as-a-service",
158163
},
159164
},
@@ -162,6 +167,7 @@ func TestValidate(t *testing.T) {
162167
cfg: Config{
163168
DBConnectionURL: "postgresql://localhost/test",
164169
APIKeyMaxExpirationDays: 365,
170+
AccessCheckTimeoutSeconds: 15,
165171
MaaSSubscriptionNamespace: "models-as-a-service",
166172
},
167173
},

maas-api/internal/handlers/models.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"net/http"
66
"sort"
77
"strings"
8+
"time"
89

910
"github.com/gin-gonic/gin"
1011
"github.com/openai/openai-go/v2/packages/pagination"
@@ -260,6 +261,7 @@ func (h *ModelsHandler) ListLLMs(c *gin.Context) {
260261

261262
// Initialize to empty slice (not nil) so JSON marshals as [] instead of null
262263
modelList := []models.Model{}
264+
accessCheckedAt := time.Now().UTC()
263265
if h.maasModelRefLister != nil {
264266
h.logger.Debug("Listing models from MaaSModelRef cache (all namespaces)")
265267
list, err := models.ListFromMaaSModelRefLister(h.maasModelRefLister)
@@ -361,11 +363,18 @@ func (h *ModelsHandler) ListLLMs(c *gin.Context) {
361363
}
362364
}
363365

366+
accessCheckedAt = time.Now().UTC()
364367
h.logger.Debug("Access validation complete", "listed", len(list), "accessible", len(modelList), "subscriptions", len(subscriptionsToUse))
365368
} else {
366369
h.logger.Debug("MaaSModelRef lister not configured, returning empty model list")
367370
}
368371

372+
// Prevent clients and proxies from caching authorization-checked model listings.
373+
// The access check is a point-in-time snapshot; auth policies may change at any moment.
374+
// X-Access-Checked-At lets clients assess the freshness of the authorization decision.
375+
c.Header("Cache-Control", "no-store")
376+
c.Header("X-Access-Checked-At", accessCheckedAt.Format(time.RFC3339))
377+
369378
h.logger.Debug("GET /v1/models returning models", "count", len(modelList))
370379
c.JSON(http.StatusOK, pagination.Page[models.Model]{
371380
Object: "list",

maas-api/internal/handlers/models_test.go

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -320,7 +320,7 @@ func TestListingModels(t *testing.T) {
320320
}
321321
router, _ := fixtures.SetupTestServer(t, config)
322322

323-
modelMgr, errMgr := models.NewManager(testLogger)
323+
modelMgr, errMgr := models.NewManager(testLogger, 15)
324324
require.NoError(t, errMgr)
325325

326326
// Set up test fixtures
@@ -350,6 +350,16 @@ func TestListingModels(t *testing.T) {
350350

351351
require.Equal(t, http.StatusOK, w.Code, "Expected status OK")
352352

353+
// Verify anti-caching and freshness headers (authorization timing race mitigation)
354+
assert.Equal(t, "no-store", w.Header().Get("Cache-Control"),
355+
"Expected Cache-Control: no-store to prevent caching of authorization-checked listings")
356+
accessCheckedAt := w.Header().Get("X-Access-Checked-At")
357+
assert.NotEmpty(t, accessCheckedAt, "Expected X-Access-Checked-At header with RFC3339 timestamp")
358+
if accessCheckedAt != "" {
359+
_, parseErr := time.Parse(time.RFC3339, accessCheckedAt)
360+
require.NoError(t, parseErr, "X-Access-Checked-At should be valid RFC3339")
361+
}
362+
353363
var response pagination.Page[models.Model]
354364
err = json.Unmarshal(w.Body.Bytes(), &response)
355365
require.NoError(t, err, "Failed to unmarshal response body")
@@ -425,7 +435,7 @@ func TestListingModelsWithSubscriptionHeader(t *testing.T) {
425435
}
426436
router, _ := fixtures.SetupTestServer(t, config)
427437

428-
modelMgr, errMgr := models.NewManager(testLogger)
438+
modelMgr, errMgr := models.NewManager(testLogger, 15)
429439
require.NoError(t, errMgr)
430440

431441
_, cleanup := fixtures.StubTokenProviderAPIs(t)
@@ -647,7 +657,7 @@ func TestListModels_ReturnAllModels(t *testing.T) {
647657
},
648658
}
649659

650-
modelMgr, err := models.NewManager(testLogger)
660+
modelMgr, err := models.NewManager(testLogger, 15)
651661
require.NoError(t, err)
652662

653663
subscriptionSelector := subscription.NewSelector(testLogger, subscriptionLister)
@@ -829,7 +839,7 @@ func TestListModels_DeduplicationBySubscription(t *testing.T) {
829839
},
830840
}
831841

832-
modelMgr, err := models.NewManager(testLogger)
842+
modelMgr, err := models.NewManager(testLogger, 15)
833843
require.NoError(t, err)
834844

835845
subscriptionSelector := subscription.NewSelector(testLogger, subscriptionLister)
@@ -940,7 +950,7 @@ func TestListModels_DifferentModelRefsWithSameModelID(t *testing.T) {
940950
},
941951
}
942952

943-
modelMgr, err := models.NewManager(testLogger)
953+
modelMgr, err := models.NewManager(testLogger, 15)
944954
require.NoError(t, err)
945955

946956
subscriptionSelector := subscription.NewSelector(testLogger, subscriptionLister)
@@ -1040,7 +1050,7 @@ func TestListModels_DifferentModelRefsWithSameURLAndModelID(t *testing.T) {
10401050
},
10411051
}
10421052

1043-
modelMgr, err := models.NewManager(testLogger)
1053+
modelMgr, err := models.NewManager(testLogger, 15)
10441054
require.NoError(t, err)
10451055

10461056
subscriptionSelector := subscription.NewSelector(testLogger, subscriptionLister)
@@ -1139,7 +1149,7 @@ func TestListModels_DifferentModelRefsWithSameModelIDAndDifferentSubscriptions(t
11391149
},
11401150
}
11411151

1142-
modelMgr, err := models.NewManager(testLogger)
1152+
modelMgr, err := models.NewManager(testLogger, 15)
11431153
require.NoError(t, err)
11441154

11451155
subscriptionSelector := subscription.NewSelector(testLogger, subscriptionLister)

maas-api/internal/models/discovery.go

Lines changed: 45 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -32,28 +32,43 @@ const maxModelsResponseBytes int64 = 4 << 20 // 4 MiB
3232

3333
// HTTP client and concurrency for access-validation probes.
3434
const (
35-
httpClientTimeout = 5 * time.Second
3635
httpMaxIdleConns = 100
3736
httpIdleConnTimeout = 90 * time.Second
3837
maxDiscoveryConcurrency = 10
38+
39+
// defaultAccessCheckTimeout bounds the total duration of FilterModelsByAccess.
40+
// This limits the staleness window between when access is checked and when
41+
// the response reaches the client. Models whose probes don't complete within
42+
// this window are excluded (fail-closed).
43+
defaultAccessCheckTimeout = 15 * time.Second
3944
)
4045

4146
// Manager runs access validation (probe model endpoints) for models listed from MaaSModelRef.
4247
type Manager struct {
43-
logger *logger.Logger
44-
httpClient *http.Client
48+
logger *logger.Logger
49+
httpClient *http.Client
50+
accessCheckTimeout time.Duration
4551
}
4652

4753
// NewManager creates a Manager for filtering models by access. The client uses InsecureSkipVerify
4854
// for cluster-internal probes; auth is enforced by the gateway/model server.
49-
func NewManager(log *logger.Logger) (*Manager, error) {
55+
// accessCheckTimeoutSeconds controls the total duration bound for access validation;
56+
// if <= 0, the default of 15 seconds is used.
57+
func NewManager(log *logger.Logger, accessCheckTimeoutSeconds int) (*Manager, error) {
5058
if log == nil {
5159
return nil, errors.New("log is required")
5260
}
61+
timeout := defaultAccessCheckTimeout
62+
if accessCheckTimeoutSeconds > 0 {
63+
timeout = time.Duration(accessCheckTimeoutSeconds) * time.Second
64+
}
5365
return &Manager{
54-
logger: log,
66+
logger: log,
67+
accessCheckTimeout: timeout,
5568
httpClient: &http.Client{
56-
Timeout: httpClientTimeout,
69+
// No per-client Timeout — each request inherits the accessCheckTimeout
70+
// deadline via its context. This ensures that configuring a longer
71+
// ACCESS_CHECK_TIMEOUT_SECONDS actually allows slower backends to respond.
5772
Transport: &http.Transport{
5873
TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, //nolint:gosec // cluster-internal only
5974
MaxIdleConns: httpMaxIdleConns,
@@ -65,12 +80,26 @@ func NewManager(log *logger.Logger) (*Manager, error) {
6580
}
6681

6782
// FilterModelsByAccess returns only models the user can access by probing each model's
68-
// /v1/models endpoint with the given Authorization and x-maas-subscription headers (passed through as-is). 2xx or 405 → include, 401/403/404 → exclude.
83+
// /v1/models endpoint with the given Authorization and x-maas-subscription headers (passed through as-is).
84+
// 2xx or 405 → include, 401/403/404 → exclude.
6985
// Models with nil URL are skipped. Concurrency is limited by maxDiscoveryConcurrency.
86+
//
87+
// Because authorization policies propagate asynchronously through the gateway, there is an
88+
// inherent eventual-consistency window: a model listed here may become inaccessible (or vice versa)
89+
// by the time the client acts on the response. Actual enforcement always happens at the gateway
90+
// when the model is invoked for inference. Callers should set Cache-Control: no-store and expose
91+
// a freshness timestamp via response headers so clients can assess freshness.
92+
//
93+
// The access check is bounded by accessCheckTimeout to limit the staleness window.
7094
func (m *Manager) FilterModelsByAccess(ctx context.Context, models []Model, authHeader string, subscriptionHeader string) []Model {
7195
if len(models) == 0 {
7296
return models
7397
}
98+
99+
// Bound the total access-check duration to limit the staleness window.
100+
ctx, cancel := context.WithTimeout(ctx, m.accessCheckTimeout)
101+
defer cancel()
102+
74103
m.logger.Debug("FilterModelsByAccess: validating access for models", "count", len(models), "subscriptionHeaderProvided", subscriptionHeader != "")
75104
// Initialize to empty slice (not nil) so JSON marshals as [] instead of null when no models are accessible
76105
out := []Model{}
@@ -222,7 +251,11 @@ func (m *Manager) fetchModelsWithRetry(ctx context.Context, authHeader string, s
222251
lastResult = authRes
223252
return lastResult != authRetry, nil
224253
}); err != nil {
225-
m.logger.Debug("Access validation failed: model fetch backoff exhausted", "service", meta.ServiceName, "endpoint", meta.Endpoint, "error", err)
254+
if errors.Is(err, context.DeadlineExceeded) || ctx.Err() == context.DeadlineExceeded {
255+
m.logger.Debug("Access validation failed: context deadline exceeded", "service", meta.ServiceName, "endpoint", meta.Endpoint, "timeout", m.accessCheckTimeout)
256+
} else {
257+
m.logger.Debug("Access validation failed: model fetch backoff exhausted", "service", meta.ServiceName, "endpoint", meta.Endpoint, "error", err)
258+
}
226259
return nil // explicit fail-closed on error
227260
}
228261

@@ -249,6 +282,10 @@ func (m *Manager) fetchModels(ctx context.Context, authHeader string, subscripti
249282
// #nosec G704 -- Intentional HTTP request to probe model endpoint for authorization check
250283
resp, err := m.httpClient.Do(req)
251284
if err != nil {
285+
if errors.Is(err, context.DeadlineExceeded) || ctx.Err() == context.DeadlineExceeded {
286+
m.logger.Debug("Access validation: request timed out (context deadline exceeded)", "service", meta.ServiceName, "endpoint", meta.Endpoint)
287+
return nil, authDenied // fail-closed, no point retrying a deadline
288+
}
252289
m.logger.Debug("Access validation: GET request failed", "service", meta.ServiceName, "endpoint", meta.Endpoint, "error", err)
253290
return nil, authRetry
254291
}

maas-controller/api/maas/v1alpha1/externalmodel_types.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,18 @@ type ExternalModelSpec struct {
7070
// The Secret must contain a data key "api-key" with the credential value.
7171
// +kubebuilder:validation:Required
7272
CredentialRef CredentialReference `json:"credentialRef"`
73+
74+
// TLSInsecureSkipVerify disables TLS certificate verification on the
75+
// DestinationRule created for this external model. When true, the generated
76+
// Istio DestinationRule includes insecureSkipVerify: true under
77+
// trafficPolicy.tls, allowing connections to endpoints with self-signed or
78+
// untrusted certificates (e.g., simulators, dev environments).
79+
//
80+
// WARNING: Do not enable in production. This is intended for testing only.
81+
//
82+
// Default: false (certificates are verified).
83+
// +optional
84+
TLSInsecureSkipVerify bool `json:"tlsInsecureSkipVerify,omitempty"`
7385
}
7486

7587
// CredentialReference references a Kubernetes Secret with provider API credentials.

maas-controller/pkg/reconciler/externalmodel/reconciler.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -284,12 +284,12 @@ func specFromExternalModel(extModel *maasv1alpha1.ExternalModel, model *maasv1al
284284
}
285285

286286
spec := ExternalModelSpec{
287-
Provider: extModel.Spec.Provider,
288-
Endpoint: extModel.Spec.Endpoint,
289-
PathPrefix: ann[AnnPathPrefix],
290-
TLS: true,
291-
Port: 443,
292-
// TLSInsecureSkipVerify: extModel.Spec.TLSInsecureSkipVerify, // requires issue #627 CRD change
287+
Provider: extModel.Spec.Provider,
288+
Endpoint: extModel.Spec.Endpoint,
289+
PathPrefix: ann[AnnPathPrefix],
290+
TLS: true,
291+
Port: 443,
292+
TLSInsecureSkipVerify: extModel.Spec.TLSInsecureSkipVerify,
293293
}
294294

295295
if spec.Provider == "" {

0 commit comments

Comments
 (0)