diff --git a/.github/workflows/integration-test-docker.yml b/.github/workflows/integration-test-docker.yml index 4e37d79b6..d94fe1050 100644 --- a/.github/workflows/integration-test-docker.yml +++ b/.github/workflows/integration-test-docker.yml @@ -82,13 +82,18 @@ jobs: - name: Download models run: | echo "Downloading minimal models for CI..." + echo "Note: Gated models (e.g., embeddinggemma-300m) will be gracefully skipped if HF_TOKEN is not available." make download-models env: CI: true CI_MINIMAL_MODELS: true - HF_TOKEN: ${{ secrets.HF_TOKEN }} HF_HUB_ENABLE_HF_TRANSFER: 1 HF_HUB_DISABLE_TELEMETRY: 1 + # HF_TOKEN is required for downloading gated models (e.g., embeddinggemma-300m) + # For PRs from forks, this will be empty and the model_manager will gracefully skip gated models + # The hf CLI uses HUGGINGFACE_HUB_TOKEN, so we set both for compatibility + HF_TOKEN: ${{ secrets.HF_TOKEN }} + HUGGINGFACE_HUB_TOKEN: ${{ secrets.HF_TOKEN }} - name: Start CI services run: | diff --git a/.github/workflows/integration-test-dynamic-config.yml b/.github/workflows/integration-test-dynamic-config.yml index a5590de96..d23ea2a5d 100644 --- a/.github/workflows/integration-test-dynamic-config.yml +++ b/.github/workflows/integration-test-dynamic-config.yml @@ -76,6 +76,12 @@ jobs: - name: Run Dynamic Config E2E tests id: e2e-test + env: + # HF_TOKEN is required for downloading gated models (e.g., embeddinggemma-300m) + # For PRs from forks, this will be empty and the model_manager will gracefully skip gated models + # The hf CLI uses HUGGINGFACE_HUB_TOKEN, so we set both for compatibility + HF_TOKEN: ${{ secrets.HF_TOKEN }} + HUGGINGFACE_HUB_TOKEN: ${{ secrets.HF_TOKEN }} run: | set +e # Don't exit on error, we want to capture the result make e2e-test E2E_PROFILE=dynamic-config E2E_VERBOSE=true E2E_KEEP_CLUSTER=false diff --git a/.github/workflows/integration-test-helm.yml b/.github/workflows/integration-test-helm.yml index 1c1892218..2c4521d0d 100644 --- a/.github/workflows/integration-test-helm.yml +++ b/.github/workflows/integration-test-helm.yml @@ -161,11 +161,28 @@ jobs: kubectl get namespace vllm-semantic-router-system echo "::endgroup::" + - name: Create HF_TOKEN secret (if available) + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: | + echo "::group::Create HF_TOKEN Secret" + if [ -n "$HF_TOKEN" ]; then + kubectl create secret generic hf-token-secret \ + --from-literal=token="$HF_TOKEN" \ + -n vllm-semantic-router-system \ + --dry-run=client -o yaml | kubectl apply -f - + echo "✓ Created hf-token-secret (HF_TOKEN is set)" + else + echo "⚠️ HF_TOKEN not set - gated models (e.g., embeddinggemma-300m) will be gracefully skipped" + fi + echo "::endgroup::" + - name: Install Helm chart (CI minimal config) run: | echo "::group::Install Chart" # CI environment: Download only essential model to avoid OOM # Only download all-MiniLM-L12-v2 (smallest model ~120MB) + # Note: Default values include embeddinggemma-300m, which will be skipped if hf-token-secret is not available helm install semantic-router ${{ env.CHART_PATH }} \ --namespace vllm-semantic-router-system \ --wait \ diff --git a/.github/workflows/integration-test-k8s.yml b/.github/workflows/integration-test-k8s.yml index 4aa4a52d6..e77fbdbe1 100644 --- a/.github/workflows/integration-test-k8s.yml +++ b/.github/workflows/integration-test-k8s.yml @@ -81,6 +81,12 @@ jobs: - name: Run Integration E2E tests (${{ matrix.profile }}) id: e2e-test + env: + # HF_TOKEN is required for downloading gated models (e.g., embeddinggemma-300m) + # For PRs from 
forks, this will be empty and the E2E framework will gracefully skip gated model downloads + # The hf CLI uses HUGGINGFACE_HUB_TOKEN, so we set both for compatibility + HF_TOKEN: ${{ secrets.HF_TOKEN }} + HUGGINGFACE_HUB_TOKEN: ${{ secrets.HF_TOKEN }} run: | set +e # Don't exit on error, we want to capture the result make e2e-test E2E_PROFILE=${{ matrix.profile }} E2E_VERBOSE=true E2E_KEEP_CLUSTER=false diff --git a/.github/workflows/performance-nightly.yml b/.github/workflows/performance-nightly.yml index 145deff84..c57a322ee 100644 --- a/.github/workflows/performance-nightly.yml +++ b/.github/workflows/performance-nightly.yml @@ -70,9 +70,13 @@ jobs: - name: Download models (minimal set for nightly) env: CI_MINIMAL_MODELS: false - HF_TOKEN: ${{ secrets.HF_TOKEN }} HF_HUB_ENABLE_HF_TRANSFER: 1 HF_HUB_DISABLE_TELEMETRY: 1 + # HF_TOKEN is required for downloading gated models (e.g., embeddinggemma-300m) + # For PRs from forks, this will be empty and the model_manager will gracefully skip gated models + # The hf CLI uses HUGGINGFACE_HUB_TOKEN, so we set both for compatibility + HF_TOKEN: ${{ secrets.HF_TOKEN }} + HUGGINGFACE_HUB_TOKEN: ${{ secrets.HF_TOKEN }} run: make download-models - name: Create reports directory diff --git a/.github/workflows/performance-test.yml b/.github/workflows/performance-test.yml index c47b0ab2c..5f24f44a8 100644 --- a/.github/workflows/performance-test.yml +++ b/.github/workflows/performance-test.yml @@ -79,9 +79,13 @@ jobs: - name: Download models (minimal) env: CI_MINIMAL_MODELS: true - HF_TOKEN: ${{ secrets.HF_TOKEN }} HF_HUB_ENABLE_HF_TRANSFER: 1 HF_HUB_DISABLE_TELEMETRY: 1 + # HF_TOKEN is required for downloading gated models (e.g., embeddinggemma-300m) + # For PRs from forks, this will be empty and the model_manager will gracefully skip gated models + # The hf CLI uses HUGGINGFACE_HUB_TOKEN, so we set both for compatibility + HF_TOKEN: ${{ secrets.HF_TOKEN }} + HUGGINGFACE_HUB_TOKEN: ${{ secrets.HF_TOKEN }} run: make download-models - name: Run component benchmarks diff --git a/.github/workflows/test-and-build.yml b/.github/workflows/test-and-build.yml index fd74ab593..d7b790828 100644 --- a/.github/workflows/test-and-build.yml +++ b/.github/workflows/test-and-build.yml @@ -138,12 +138,20 @@ jobs: pip install -r src/model_manager/requirements.txt - name: Download models (minimal on PRs) + run: | + echo "Downloading models for CI..." + echo "Note: Gated models (e.g., embeddinggemma-300m) will be gracefully skipped if HF_TOKEN is not available." + echo "This is expected for PRs from forks where secrets are not exposed." 
+ make download-models env: CI_MINIMAL_MODELS: ${{ github.event_name == 'pull_request' }} - HF_TOKEN: ${{ secrets.HF_TOKEN }} HF_HUB_ENABLE_HF_TRANSFER: 1 HF_HUB_DISABLE_TELEMETRY: 1 - run: make download-models + # HF_TOKEN is required for downloading gated models (e.g., embeddinggemma-300m) + # For PRs from forks, this will be empty and the model_manager will gracefully skip gated models + # The hf CLI uses HUGGINGFACE_HUB_TOKEN, so we set both for compatibility + HF_TOKEN: ${{ secrets.HF_TOKEN }} + HUGGINGFACE_HUB_TOKEN: ${{ secrets.HF_TOKEN }} - name: Start Milvus service run: | diff --git a/candle-binding/semantic-router_test.go b/candle-binding/semantic-router_test.go index 9d7a4427c..ee3f06766 100644 --- a/candle-binding/semantic-router_test.go +++ b/candle-binding/semantic-router_test.go @@ -1482,7 +1482,7 @@ func TestGetEmbeddingSmart(t *testing.T) { } t.Run("ShortTextHighLatency", func(t *testing.T) { - // Short text with high latency priority - uses Qwen3 (1024) since Gemma is not available + // Short text with high latency priority should use Gemma (768) text := "Hello world" embedding, err := GetEmbeddingSmart(text, 0.3, 0.8) @@ -1490,16 +1490,15 @@ func TestGetEmbeddingSmart(t *testing.T) { t.Fatalf("GetEmbeddingSmart failed: %v", err) } - // Expect Qwen3 (1024) dimension since Gemma is not available - if len(embedding) != 1024 { - t.Errorf("Expected 1024-dim embedding, got %d", len(embedding)) + if len(embedding) != 768 { + t.Errorf("Expected 768-dim embedding, got %d", len(embedding)) } t.Logf("Short text embedding generated: dim=%d", len(embedding)) }) t.Run("MediumTextBalanced", func(t *testing.T) { - // Medium text with balanced priorities - uses Qwen3 (1024) since Gemma is not available + // Medium text with balanced priorities - may select Qwen3 (1024) or Gemma (768) text := strings.Repeat("This is a medium length text with enough words to exceed 512 tokens. 
", 10) embedding, err := GetEmbeddingSmart(text, 0.5, 0.5) @@ -1507,9 +1506,9 @@ func TestGetEmbeddingSmart(t *testing.T) { t.Fatalf("GetEmbeddingSmart failed: %v", err) } - // Expect Qwen3 (1024) dimension since Gemma is not available - if len(embedding) != 1024 { - t.Errorf("Expected 1024-dim embedding, got %d", len(embedding)) + // Accept both Qwen3 (1024) and Gemma (768) dimensions + if len(embedding) != 768 && len(embedding) != 1024 { + t.Errorf("Expected 768 or 1024-dim embedding, got %d", len(embedding)) } t.Logf("Medium text embedding generated: dim=%d", len(embedding)) @@ -1569,9 +1568,9 @@ func TestGetEmbeddingSmart(t *testing.T) { return } - // Expect Qwen3 (1024) since Gemma is not available - if len(embedding) != 1024 { - t.Errorf("Expected 1024-dim embedding, got %d", len(embedding)) + // Smart routing may select Qwen3 (1024) or Gemma (768) based on priorities + if len(embedding) != 768 && len(embedding) != 1024 { + t.Errorf("Expected 768 or 1024-dim embedding, got %d", len(embedding)) } t.Logf("Priority test %s: generated %d-dim embedding", tc.desc, len(embedding)) }) @@ -1594,9 +1593,9 @@ func TestGetEmbeddingSmart(t *testing.T) { continue } - // Expect Qwen3 (1024) since Gemma is not available - if len(embedding) != 1024 { - t.Errorf("Iteration %d: Expected 1024-dim embedding, got %d", i, len(embedding)) + // Smart routing may select Qwen3 (1024) or Gemma (768) + if len(embedding) != 768 && len(embedding) != 1024 { + t.Errorf("Iteration %d: Expected 768 or 1024-dim embedding, got %d", i, len(embedding)) } // Verify no nil pointers @@ -1635,12 +1634,11 @@ func BenchmarkGetEmbeddingSmart(b *testing.B) { } // Test constants for embedding models (Phase 4.2) -// Note: Gemma model is gated and requires HF_TOKEN, so tests use Qwen3 only const ( Qwen3EmbeddingModelPath = "../models/Qwen3-Embedding-0.6B" - GemmaEmbeddingModelPath = "" // Gemma is gated, not used in CI tests + GemmaEmbeddingModelPath = "../models/embeddinggemma-300m" TestEmbeddingText = "This is a test sentence for embedding generation" - TestLongContextText = "This is a longer text that might benefit from long-context embedding models like Qwen3" + TestLongContextText = "This is a longer text that might benefit from long-context embedding models like Qwen3 or Gemma" ) // Test constants for Qwen3 Multi-LoRA @@ -1702,8 +1700,22 @@ func TestInitEmbeddingModels(t *testing.T) { }) t.Run("InitGemmaOnly", func(t *testing.T) { - // Gemma is a gated model requiring HF_TOKEN, skip in CI - t.Skip("Skipping Gemma-only test: Gemma is a gated model requiring HF_TOKEN") + err := InitEmbeddingModels("", GemmaEmbeddingModelPath, true) + if err != nil { + t.Logf("InitEmbeddingModels (Gemma only) returned error (may already be initialized): %v", err) + + // Verify functionality + _, testErr := GetEmbeddingSmart("test", 0.5, 0.5) + if testErr == nil { + t.Log("✓ ModelFactory is functional (already initialized)") + } else { + if isModelInitializationError(testErr) { + t.Skipf("Skipping test due to model unavailability: %v", testErr) + } + } + } else { + t.Log("✓ Gemma model initialized successfully") + } }) t.Run("InitWithInvalidPaths", func(t *testing.T) { @@ -1785,16 +1797,16 @@ func TestGetEmbeddingWithDim(t *testing.T) { t.Run("OversizedDimension", func(t *testing.T) { // Test graceful degradation when requested dimension exceeds model capacity - // Qwen3: 1024, so 2048 should fall back to full dimension + // Qwen3: 1024, Gemma: 768, so 2048 should fall back to full dimension embedding, err := GetEmbeddingWithDim(TestEmbeddingText, 
0.5, 0.5, 2048) if err != nil { t.Errorf("Should gracefully handle oversized dimension, got error: %v", err) return } - // Should return full dimension (1024 for Qwen3) - if len(embedding) != 1024 { - t.Errorf("Expected full dimension (1024), got %d", len(embedding)) + // Should return full dimension (1024 for Qwen3 or 768 for Gemma) + if len(embedding) != 1024 && len(embedding) != 768 { + t.Errorf("Expected full dimension (1024 or 768), got %d", len(embedding)) } else { t.Logf("✓ Oversized dimension gracefully degraded to full dimension: %d", len(embedding)) } @@ -1889,9 +1901,6 @@ func TestEmbeddingPriorityRouting(t *testing.T) { if err != nil { t.Fatalf("Failed to initialize embedding models: %v", err) } - - // Note: These tests use Matryoshka dimension truncation (768) with Qwen3 model - // The dimension is truncated from Qwen3's full 1024 dimensions testCases := []struct { name string text string @@ -1906,7 +1915,7 @@ func TestEmbeddingPriorityRouting(t *testing.T) { qualityPriority: 0.2, latencyPriority: 0.9, expectedDim: 768, - description: "Uses Qwen3 with Matryoshka 768 truncation", + description: "Should prefer faster embedding model (Gemma > Qwen3)", }, { name: "HighQualityPriority", @@ -1914,7 +1923,7 @@ func TestEmbeddingPriorityRouting(t *testing.T) { qualityPriority: 0.9, latencyPriority: 0.2, expectedDim: 768, - description: "Uses Qwen3 with Matryoshka 768 truncation", + description: "Should prefer quality model (Qwen3/Gemma)", }, { name: "BalancedPriority", @@ -1922,7 +1931,7 @@ func TestEmbeddingPriorityRouting(t *testing.T) { qualityPriority: 0.5, latencyPriority: 0.5, expectedDim: 768, - description: "Uses Qwen3 with Matryoshka 768 truncation", + description: "Should select based on text length", }, } diff --git a/config/model_manager/models.lora.yaml b/config/model_manager/models.lora.yaml index 22375f144..cda113cbf 100644 --- a/config/model_manager/models.lora.yaml +++ b/config/model_manager/models.lora.yaml @@ -29,3 +29,7 @@ models: - id: Qwen3-Embedding-0.6B repo_id: Qwen/Qwen3-Embedding-0.6B + + # Gated model - requires HF_TOKEN (will gracefully skip if token not available) + - id: embeddinggemma-300m + repo_id: google/embeddinggemma-300m diff --git a/config/model_manager/models.minimal.yaml b/config/model_manager/models.minimal.yaml index cff4b5d06..4f584c413 100644 --- a/config/model_manager/models.minimal.yaml +++ b/config/model_manager/models.minimal.yaml @@ -10,8 +10,8 @@ # Equivalent to: make download-models-minimal # or CI_MINIMAL_MODELS=true make download-models # -# Note: This is the minimal set for fast CI runs. Larger models like -# embeddinggemma-300m are in models.yaml (full set) for local development. +# Note: This is the minimal set for fast CI runs. Gated models like +# embeddinggemma-300m will gracefully skip if HF_TOKEN is not available. 
cache_dir: "models" verify: "size" # Use size for faster CI runs @@ -56,6 +56,10 @@ models: - id: Qwen3-Embedding-0.6B repo_id: Qwen/Qwen3-Embedding-0.6B + # Gated model - requires HF_TOKEN (will gracefully skip if token not available) + - id: embeddinggemma-300m + repo_id: google/embeddinggemma-300m + # ============================================================================= # Hallucination Detection - Required for hallucination tests # ============================================================================= diff --git a/deploy/helm/semantic-router/templates/deployment.yaml b/deploy/helm/semantic-router/templates/deployment.yaml index 761a88615..b095d583e 100644 --- a/deploy/helm/semantic-router/templates/deployment.yaml +++ b/deploy/helm/semantic-router/templates/deployment.yaml @@ -42,13 +42,33 @@ spec: command: ["/bin/bash", "-c"] args: - | - set -e echo "Downloading models to persistent volume..." cd /app/models {{- range .Values.initContainer.models }} # Download {{ .name }} echo "Downloading {{ .name }} from {{ .repo }}..." + + {{- if or (eq .name "embeddinggemma-300m") (contains "embeddinggemma" .name) }} + # Skip gated models if token is missing + if [ -z "${HF_TOKEN:-}" ] && [ -z "${HUGGINGFACE_HUB_TOKEN:-}" ]; then + echo "⚠️ Skipping {{ .name }} (HF_TOKEN not set, gated model requires authentication)" + else + # Remove .cache directory to ensure fresh download + rm -rf "{{ .name }}/.cache" 2>/dev/null || true + # Download with ignore_patterns to exclude ONNX-only files if pytorch model exists + python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='{{ .repo }}', local_dir='{{ .name }}', ignore_patterns=['*.onnx', '*.msgpack', '*.h5', '*.tflite'] if '{{ .name }}' == 'all-MiniLM-L12-v2' else None)" + + # Check for required model files + echo "Checking {{ .name }} for required files:" + if [ -f "{{ .name }}/pytorch_model.bin" ] || [ -f "{{ .name }}/model.safetensors" ]; then + echo "✓ Found PyTorch model weights in {{ .name }}" + else + echo "✗ WARNING: No PyTorch model weights found in {{ .name }}" + ls -la "{{ .name }}/" | head -20 + fi + fi # End of HF_TOKEN check for Gemma + {{- else }} # Remove .cache directory to ensure fresh download rm -rf "{{ .name }}/.cache" 2>/dev/null || true # Download with ignore_patterns to exclude ONNX-only files if pytorch model exists @@ -62,6 +82,7 @@ spec: echo "✗ WARNING: No PyTorch model weights found in {{ .name }}" ls -la "{{ .name }}/" | head -20 fi + {{- end }} {{- end }} echo "All models downloaded successfully!" @@ -69,9 +90,9 @@ spec: env: - name: HF_HUB_CACHE value: /tmp/hf_cache - {{- with .Values.initContainer.env }} - {{- toYaml . | nindent 10 }} - {{- end }} + {{- with .Values.initContainer.env }} + {{- toYaml . | nindent 8 }} + {{- end }} resources: {{- toYaml .Values.initContainer.resources | nindent 10 }} volumeMounts: diff --git a/deploy/helm/semantic-router/values.yaml b/deploy/helm/semantic-router/values.yaml index 736868c37..0fffb0bb5 100644 --- a/deploy/helm/semantic-router/values.yaml +++ b/deploy/helm/semantic-router/values.yaml @@ -149,20 +149,28 @@ initContainer: # -- Additional environment variables for the init container. # For example, to use a private Hugging Face model, you can pass a token # and specify an endpoint using a pre-existing Kubernetes secret. 
- # env: - # - name: HF_TOKEN - # valueFrom: - # secretKeyRef: - # name: my-hf-secret - # key: token - # - name: HF_ENDPOINT - # value: "https://huggingface.co" - env: [] + # HF_TOKEN is required for downloading gated models like embeddinggemma-300m + # For PRs from forks, this will be empty and gated models will be gracefully skipped + env: + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + optional: true # Allow deployment even if secret doesn't exist (for local testing) + - name: HUGGINGFACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + optional: true # Allow deployment even if secret doesn't exist (for local testing) # -- Models to download models: # Embedding models for semantic cache and tools - name: Qwen3-Embedding-0.6B repo: Qwen/Qwen3-Embedding-0.6B + - name: embeddinggemma-300m + repo: google/embeddinggemma-300m - name: all-MiniLM-L12-v2 repo: sentence-transformers/all-MiniLM-L12-v2 - name: lora_intent_classifier_bert-base-uncased_model diff --git a/e2e/pkg/framework/runner.go b/e2e/pkg/framework/runner.go index 8c6b80861..aa0de8278 100644 --- a/e2e/pkg/framework/runner.go +++ b/e2e/pkg/framework/runner.go @@ -114,6 +114,19 @@ func (r *Runner) Run(ctx context.Context) error { // Set Kubernetes client for report generator r.reporter.SetKubeClient(kubeClient) + // Step 3.5: Create HF_TOKEN secret if available (for gated model downloads) + // This is required for downloading gated models like google/embeddinggemma-300m + if hfToken := os.Getenv("HF_TOKEN"); hfToken != "" { + if err := r.createHFTokenSecret(ctx, kubeClient); err != nil { + r.log("⚠️ Warning: Failed to create HF_TOKEN secret: %v", err) + r.log(" Model downloads may fail if gated models (e.g., embeddinggemma-300m) are required") + } else { + r.log("✅ Created HF_TOKEN secret for gated model downloads") + } + } else { + r.log("ℹ️ HF_TOKEN not set - gated models (e.g., embeddinggemma-300m) may not be downloadable") + } + // Step 4: Setup profile (deploy Helm charts, etc.) 
if !r.opts.SkipSetup { setupOpts := &SetupOptions{ @@ -492,6 +505,68 @@ func (r *Runner) collectSemanticRouterLogs(ctx context.Context, client *kubernet return nil } +// createHFTokenSecret creates a Kubernetes secret for HF_TOKEN if it's available in the environment +// This is required for the init container to download gated models like google/embeddinggemma-300m +// The secret must be in the same namespace as the semantic-router deployment (vllm-semantic-router-system) +// because Kubernetes secrets are namespace-scoped +func (r *Runner) createHFTokenSecret(ctx context.Context, kubeClient *kubernetes.Clientset) error { + hfToken := os.Getenv("HF_TOKEN") + if hfToken == "" { + return nil // No token to create + } + + // All E2E profiles deploy semantic-router to this namespace + nsName := "vllm-semantic-router-system" + + // First, ensure the namespace exists + _, err := kubeClient.CoreV1().Namespaces().Get(ctx, nsName, metav1.GetOptions{}) + if err != nil { + // Namespace doesn't exist, create it + ns := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: nsName, + }, + } + _, err = kubeClient.CoreV1().Namespaces().Create(ctx, ns, metav1.CreateOptions{}) + if err != nil && !strings.Contains(err.Error(), "already exists") { + // If we can't create the namespace, that's okay - the profile will create it + r.log("⚠️ Could not create namespace %s (will be created by profile): %v", nsName, err) + } + } + + // Create the secret in the namespace where semantic-router is deployed + secret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "hf-token-secret", + Namespace: nsName, + }, + Type: corev1.SecretTypeOpaque, + StringData: map[string]string{ + "token": hfToken, + }, + } + + _, err = kubeClient.CoreV1().Secrets(nsName).Create(ctx, secret, metav1.CreateOptions{}) + if err != nil { + // If secret already exists, update it + if strings.Contains(err.Error(), "already exists") { + _, err = kubeClient.CoreV1().Secrets(nsName).Update(ctx, secret, metav1.UpdateOptions{}) + if err != nil { + return fmt.Errorf("failed to update existing HF_TOKEN secret in %s: %w", nsName, err) + } + return nil + } + // If namespace still doesn't exist, that's okay - it will be created by Helm + if strings.Contains(err.Error(), "not found") { + r.log("⚠️ Namespace %s not found yet (will be created by profile)", nsName) + return nil + } + return fmt.Errorf("failed to create HF_TOKEN secret in %s: %w", nsName, err) + } + + return nil +} + func getPodReadyStatus(pod corev1.Pod) string { readyCount := 0 totalCount := len(pod.Status.ContainerStatuses) diff --git a/e2e/profiles/ai-gateway/values.yaml b/e2e/profiles/ai-gateway/values.yaml index f369f77bd..d9d3b92fa 100644 --- a/e2e/profiles/ai-gateway/values.yaml +++ b/e2e/profiles/ai-gateway/values.yaml @@ -626,6 +626,7 @@ config: # - EmbeddingGemma-300M: Up to 8K context, fast inference, Matryoshka support (768/512/256/128) embedding_models: qwen3_model_path: "models/Qwen3-Embedding-0.6B" + gemma_model_path: "models/embeddinggemma-300m" use_cpu: true # Set to false for GPU acceleration (requires CUDA) # Observability Configuration diff --git a/e2e/profiles/dynamic-config/profile.go b/e2e/profiles/dynamic-config/profile.go index fc397de01..3c7b84957 100644 --- a/e2e/profiles/dynamic-config/profile.go +++ b/e2e/profiles/dynamic-config/profile.go @@ -256,16 +256,37 @@ func (p *Profile) kubectlApply(ctx context.Context, kubeconfig, manifestPath str func (p *Profile) verifyCRDsExist(ctx context.Context, kubeconfig string) error { // Verify IntelligentPool 
exists - cmd := exec.CommandContext(ctx, "kubectl", "get", "intelligentpool", "ai-gateway-pool", "-n", "default", "--kubeconfig", kubeconfig) - if err := cmd.Run(); err != nil { + cmd := exec.CommandContext(ctx, "kubectl", "get", "intelligentpool", "ai-gateway-pool", "-n", "default", "--kubeconfig", kubeconfig, "-o", "yaml") + output, err := cmd.Output() + if err != nil { return fmt.Errorf("IntelligentPool 'ai-gateway-pool' not found: %w", err) } + if p.verbose { + p.log("IntelligentPool 'ai-gateway-pool' found:\n%s", string(output)) + } // Verify IntelligentRoute exists - cmd = exec.CommandContext(ctx, "kubectl", "get", "intelligentroute", "ai-gateway-route", "-n", "default", "--kubeconfig", kubeconfig) - if err := cmd.Run(); err != nil { + cmd = exec.CommandContext(ctx, "kubectl", "get", "intelligentroute", "ai-gateway-route", "-n", "default", "--kubeconfig", kubeconfig, "-o", "yaml") + output, err = cmd.Output() + if err != nil { return fmt.Errorf("IntelligentRoute 'ai-gateway-route' not found: %w", err) } + if p.verbose { + p.log("IntelligentRoute 'ai-gateway-route' found:\n%s", string(output)) + } + + // Check CRD status conditions + cmd = exec.CommandContext(ctx, "kubectl", "get", "intelligentpool", "ai-gateway-pool", "-n", "default", "--kubeconfig", kubeconfig, "-o", "jsonpath={.status.conditions}") + statusOutput, _ := cmd.Output() + if p.verbose { + p.log("IntelligentPool status conditions: %s", string(statusOutput)) + } + + cmd = exec.CommandContext(ctx, "kubectl", "get", "intelligentroute", "ai-gateway-route", "-n", "default", "--kubeconfig", kubeconfig, "-o", "jsonpath={.status.conditions}") + statusOutput, _ = cmd.Output() + if p.verbose { + p.log("IntelligentRoute status conditions: %s", string(statusOutput)) + } return nil } @@ -359,6 +380,23 @@ func (p *Profile) verifyEnvironment(ctx context.Context, opts *framework.SetupOp p.log("All deployments are healthy") + // Additional diagnostic: Check semantic-router pod logs for CRD processing + if p.verbose { + p.log("Checking semantic-router pod logs for CRD processing...") + cmd := exec.CommandContext(ctx, "kubectl", "logs", "-n", "vllm-semantic-router-system", "-l", "app.kubernetes.io/name=semantic-router", "--tail=50", "--kubeconfig", opts.KubeConfig) + logs, err := cmd.Output() + if err == nil { + p.log("Recent semantic-router logs:\n%s", string(logs)) + } else { + p.log("Could not retrieve semantic-router logs: %v", err) + } + + // Check if semantic-router is configured with Kubernetes config source + cmd = exec.CommandContext(ctx, "kubectl", "get", "deployment", "semantic-router", "-n", "vllm-semantic-router-system", "-o", "jsonpath={.spec.template.spec.containers[0].env}", "--kubeconfig", opts.KubeConfig) + envOutput, _ := cmd.Output() + p.log("Semantic-router container environment variables: %s", string(envOutput)) + } + return nil } diff --git a/e2e/profiles/dynamic-config/values.yaml b/e2e/profiles/dynamic-config/values.yaml index bf77f4206..ef3e13ecd 100644 --- a/e2e/profiles/dynamic-config/values.yaml +++ b/e2e/profiles/dynamic-config/values.yaml @@ -5,7 +5,7 @@ # Environment variables for the semantic-router container env: - name: EMBEDDING_MODEL_OVERRIDE - value: "qwen3" # Force qwen3 for tests (Gemma requires HF_TOKEN) + value: "auto" # Use intelligent model selection config: # Set config source to kubernetes to enable CRD-based configuration @@ -127,7 +127,7 @@ config: embedding_models: qwen3_model_path: "models/Qwen3-Embedding-0.6B" - gemma_model_path: "" # Empty = fallback to Qwen3 (embeddinggemma requires 
HF_TOKEN) + gemma_model_path: "models/embeddinggemma-300m" use_cpu: true # Increase memory limits for embedding model support diff --git a/scripts/quickstart.sh b/scripts/quickstart.sh index 78dbf169d..82260fbe8 100755 --- a/scripts/quickstart.sh +++ b/scripts/quickstart.sh @@ -167,13 +167,17 @@ download_models() { info_msg "📥 Downloading AI models..." echo - # Try full model set first (includes embeddinggemma-300m which may require auth) + # Try full model set first (includes embeddinggemma-300m which requires HF_TOKEN for gated access) # If that fails (e.g., 401 on gated models), fall back to minimal set if [ "${CI_MINIMAL_MODELS:-}" = "true" ]; then info_msg "CI_MINIMAL_MODELS=true detected, using minimal model set" export CI_MINIMAL_MODELS=true else info_msg "Attempting to download full model set (includes embeddinggemma-300m)..." + if [ -z "${HF_TOKEN:-}" ]; then + info_msg "ℹ️ Note: HF_TOKEN not set. If embeddinggemma-300m download fails, script will fall back to minimal model set." + info_msg " To download Gemma, set HF_TOKEN environment variable: export HF_TOKEN=your_token" + fi export CI_MINIMAL_MODELS=false fi @@ -183,11 +187,13 @@ download_models() { else # Check if failure was due to gated model (embeddinggemma-300m) if grep -q "embeddinggemma.*401\|embeddinggemma.*Unauthorized\|embeddinggemma.*GatedRepoError" /tmp/download-models-output.log 2>/dev/null; then - info_msg "⚠️ Full model download failed (gated model requires auth)" - info_msg "📋 Falling back to minimal model set..." + info_msg "⚠️ Full model download failed: embeddinggemma-300m requires HF_TOKEN for gated model access" + info_msg "📋 Falling back to minimal model set (without Gemma)..." + info_msg "💡 To download Gemma, set HF_TOKEN: export HF_TOKEN=your_token && make download-models" export CI_MINIMAL_MODELS=true if make download-models 2>&1 | tee /tmp/download-models-output.log; then success_msg "✅ Minimal models downloaded successfully!" + info_msg "ℹ️ Note: Gemma embedding model was skipped. Some features may be limited." else error_msg "❌ Failed to download even minimal models!" 
info_msg "📋 Check logs: cat /tmp/download-models-output.log" diff --git a/src/model_manager/__init__.py b/src/model_manager/__init__.py index 5da0bffd6..7018456c7 100644 --- a/src/model_manager/__init__.py +++ b/src/model_manager/__init__.py @@ -24,6 +24,7 @@ MissingModelError, BadChecksumError, DownloadError, + GatedModelError, ) __version__ = "0.1.0" @@ -41,6 +42,7 @@ "MissingModelError", "BadChecksumError", "DownloadError", + "GatedModelError", ] @@ -96,15 +98,25 @@ def ensure_all(self) -> dict[str, str]: continue logger.info(f"Downloading model '{spec.id}' from {spec.repo_id}...") - local_path = download_model(spec, self.config.cache_dir) - - if self.config.verify != "none": - logger.info(f"Verifying model '{spec.id}'...") - if not verify_model(local_path, self.config.verify): - raise BadChecksumError(f"Verification failed for model '{spec.id}'") - - results[spec.id] = local_path - logger.info(f"Model '{spec.id}' ready at {local_path}") + try: + local_path = download_model(spec, self.config.cache_dir) + + if self.config.verify != "none": + logger.info(f"Verifying model '{spec.id}'...") + if not verify_model(local_path, self.config.verify): + raise BadChecksumError( + f"Verification failed for model '{spec.id}'" + ) + + results[spec.id] = local_path + logger.info(f"Model '{spec.id}' ready at {local_path}") + except GatedModelError as e: + # Gracefully skip gated models when token is not available + logger.warning( + f"⚠️ Skipping gated model '{spec.id}': {e}. " + "This is expected for PRs from forks where HF_TOKEN is not available." + ) + continue return results @@ -117,7 +129,14 @@ def ensure_model(self, model_id: str) -> str: Returns: Local path to the model + + Raises: + GatedModelError: If the model is gated and HF_TOKEN is not available """ + import logging + + logger = logging.getLogger(__name__) + spec = self.get_model_spec(model_id) if spec is None: raise MissingModelError(f"Model '{model_id}' not found in configuration") diff --git a/src/model_manager/cli.py b/src/model_manager/cli.py index e79274e89..352e3deaa 100644 --- a/src/model_manager/cli.py +++ b/src/model_manager/cli.py @@ -189,6 +189,8 @@ def main() -> int: # Ensure all models results = manager.ensure_all() logger.info(f"All {len(results)} models ready") + # Note: Some gated models may have been skipped if HF_TOKEN is not available + # This is expected behavior and not an error return 0 except ModelManagerError as e: diff --git a/src/model_manager/downloader.py b/src/model_manager/downloader.py index 5defb3492..d3643cb26 100644 --- a/src/model_manager/downloader.py +++ b/src/model_manager/downloader.py @@ -11,8 +11,14 @@ from huggingface_hub import snapshot_download from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError +try: + from huggingface_hub.errors import GatedRepoError +except ImportError: + # Fallback for older versions of huggingface_hub + GatedRepoError = None + from .config import ModelSpec -from .errors import DownloadError, MissingModelError +from .errors import DownloadError, MissingModelError, GatedModelError logger = logging.getLogger(__name__) @@ -75,17 +81,56 @@ def download_model(spec: ModelSpec, cache_dir: str) -> str: logger.info(f"Successfully downloaded model '{spec.id}' to '{result_path}'") return result_path - except RepositoryNotFoundError: + except RepositoryNotFoundError as e: + # RepositoryNotFoundError (404) can occur for gated models when not authenticated + # HuggingFace returns 404 instead of 401 to avoid revealing repository existence + # Check if this might 
be a gated model by examining the error message + error_str = str(e).lower() + # Known gated models that might return 404 when not authenticated + known_gated_models = ["embeddinggemma", "gemma"] + is_known_gated = any( + gated_name in spec.repo_id.lower() for gated_name in known_gated_models + ) + + # If it's a known gated model or the error suggests authentication issues, treat as gated + if ( + is_known_gated + or "401" in error_str + or "unauthorized" in error_str + or "gated" in error_str + ): + raise GatedModelError( + f"Gated model '{spec.id}' requires HF_TOKEN authentication. " + f"Set HF_TOKEN or HUGGINGFACE_HUB_TOKEN environment variable to download. " + f"Note: Gated models may return 'Repository not found' (404) when not authenticated." + ) from e + raise MissingModelError( f"Repository not found: '{spec.repo_id}'. " "Check if the repository exists and you have access." - ) + ) from e except RevisionNotFoundError: raise MissingModelError( f"Revision not found: '{spec.revision}' in repository '{spec.repo_id}'. " "Check if the revision (commit/tag/branch) exists." ) except Exception as e: + # Check if this is a gated model error (401 Unauthorized or GatedRepoError) + error_str = str(e).lower() + is_gated_error = ( + (GatedRepoError is not None and isinstance(e, GatedRepoError)) + or "401" in error_str + or "unauthorized" in error_str + or "gated" in error_str + or "gatedrepoerror" in error_str + ) + + if is_gated_error: + raise GatedModelError( + f"Gated model '{spec.id}' requires HF_TOKEN authentication. " + f"Set HF_TOKEN or HUGGINGFACE_HUB_TOKEN environment variable to download." + ) from e + raise DownloadError( f"Failed to download model '{spec.id}' from '{spec.repo_id}': {e}" ) from e diff --git a/src/model_manager/errors.py b/src/model_manager/errors.py index 25137676d..20e78bee9 100644 --- a/src/model_manager/errors.py +++ b/src/model_manager/errors.py @@ -31,3 +31,9 @@ class ConfigurationError(ModelManagerError): """Raised when configuration is invalid or missing.""" pass + + +class GatedModelError(ModelManagerError): + """Raised when attempting to download a gated model without authentication.""" + + pass diff --git a/src/semantic-router/pkg/extproc/processor_req_body.go b/src/semantic-router/pkg/extproc/processor_req_body.go index f2b546f3b..73d210f5f 100644 --- a/src/semantic-router/pkg/extproc/processor_req_body.go +++ b/src/semantic-router/pkg/extproc/processor_req_body.go @@ -373,6 +373,13 @@ func (r *OpenAIRouter) createRoutingResponse(model string, endpoint string, modi RawValue: []byte(model), }, }) + // Add x-ai-eg-model header for Envoy AI Gateway compatibility + setHeaders = append(setHeaders, &core.HeaderValueOption{ + Header: &core.HeaderValue{ + Key: "x-ai-eg-model", // Envoy AI Gateway expects this header + RawValue: []byte(model), + }, + }) } // For Response API requests, modify :path to /v1/chat/completions @@ -441,6 +448,13 @@ func (r *OpenAIRouter) createSpecifiedModelResponse(model string, endpoint strin RawValue: []byte(model), }, }) + // Add x-ai-eg-model header for Envoy AI Gateway compatibility + setHeaders = append(setHeaders, &core.HeaderValueOption{ + Header: &core.HeaderValue{ + Key: "x-ai-eg-model", // Envoy AI Gateway expects this header + RawValue: []byte(model), + }, + }) // For Response API requests, modify :path to /v1/chat/completions and use translated body var bodyMutation *ext_proc.BodyMutation
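
Local verification note (not part of the patch): the Helm values above now reference an optional `hf-token-secret`, and the `integration-test-helm` workflow creates it only when `HF_TOKEN` is set. A minimal sketch of reproducing that setup by hand, assuming the chart directory `deploy/helm/semantic-router` and the `vllm-semantic-router-system` namespace used by the workflows:

    # Sketch only: mirrors the "Create HF_TOKEN secret (if available)" workflow step.
    # <your-hf-token> is a placeholder; the secret is marked optional in values.yaml,
    # so omitting it simply means gated models such as google/embeddinggemma-300m
    # are skipped by the init container instead of failing the deployment.
    export HF_TOKEN=<your-hf-token>

    kubectl create namespace vllm-semantic-router-system \
      --dry-run=client -o yaml | kubectl apply -f -

    kubectl create secret generic hf-token-secret \
      --from-literal=token="$HF_TOKEN" \
      -n vllm-semantic-router-system \
      --dry-run=client -o yaml | kubectl apply -f -

    helm install semantic-router deploy/helm/semantic-router \
      --namespace vllm-semantic-router-system \
      --wait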