7 changes: 6 additions & 1 deletion .github/workflows/integration-test-docker.yml
@@ -82,13 +82,18 @@ jobs:
- name: Download models
run: |
echo "Downloading minimal models for CI..."
echo "Note: Gated models (e.g., embeddinggemma-300m) will be gracefully skipped if HF_TOKEN is not available."
make download-models
env:
CI: true
CI_MINIMAL_MODELS: true
HF_TOKEN: ${{ secrets.HF_TOKEN }}
HF_HUB_ENABLE_HF_TRANSFER: 1
HF_HUB_DISABLE_TELEMETRY: 1
# HF_TOKEN is required for downloading gated models (e.g., embeddinggemma-300m)
# For PRs from forks, this will be empty and the model_manager will gracefully skip gated models
# The hf CLI uses HUGGINGFACE_HUB_TOKEN, so we set both for compatibility
HF_TOKEN: ${{ secrets.HF_TOKEN }}
HUGGINGFACE_HUB_TOKEN: ${{ secrets.HF_TOKEN }}

- name: Start CI services
run: |
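The "gracefully skip" behavior these comments reference lives in `src/model_manager`, which this diff does not touch. A minimal sketch of that logic, assuming the manager uses `huggingface_hub`; the `GATED_MODELS` set and `download_model` helper are hypothetical names, not the project's actual API:

```python
# Hypothetical sketch: the real src/model_manager code is not shown in this PR.
import os

from huggingface_hub import snapshot_download
from huggingface_hub.errors import GatedRepoError

# Assumption: gated repos are tracked explicitly by repo_id.
GATED_MODELS = {"google/embeddinggemma-300m"}

def download_model(repo_id: str, local_dir: str) -> bool:
    """Download a model; return False instead of raising when a gated model
    cannot be fetched (e.g., PRs from forks, where secrets are not exposed)."""
    token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN")
    if repo_id in GATED_MODELS and not token:
        print(f"Skipping gated model {repo_id}: HF_TOKEN not set")
        return False
    try:
        snapshot_download(repo_id=repo_id, local_dir=local_dir, token=token)
        return True
    except GatedRepoError:
        # A token was provided but does not grant access to the gated repo.
        print(f"Skipping gated model {repo_id}: token lacks access")
        return False
```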
6 changes: 6 additions & 0 deletions .github/workflows/integration-test-dynamic-config.yml
@@ -76,6 +76,12 @@ jobs:

- name: Run Dynamic Config E2E tests
id: e2e-test
env:
# HF_TOKEN is required for downloading gated models (e.g., embeddinggemma-300m)
# For PRs from forks, this will be empty and the model_manager will gracefully skip gated models
# The hf CLI uses HUGGINGFACE_HUB_TOKEN, so we set both for compatibility
HF_TOKEN: ${{ secrets.HF_TOKEN }}
HUGGINGFACE_HUB_TOKEN: ${{ secrets.HF_TOKEN }}
run: |
set +e # Don't exit on error; we want to capture the result
make e2e-test E2E_PROFILE=dynamic-config E2E_VERBOSE=true E2E_KEEP_CLUSTER=false
17 changes: 17 additions & 0 deletions .github/workflows/integration-test-helm.yml
@@ -161,11 +161,28 @@ jobs:
kubectl get namespace vllm-semantic-router-system
echo "::endgroup::"

- name: Create HF_TOKEN secret (if available)
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
run: |
echo "::group::Create HF_TOKEN Secret"
if [ -n "$HF_TOKEN" ]; then
kubectl create secret generic hf-token-secret \
--from-literal=token="$HF_TOKEN" \
-n vllm-semantic-router-system \
--dry-run=client -o yaml | kubectl apply -f -
echo "✓ Created hf-token-secret (HF_TOKEN is set)"
else
echo "⚠️ HF_TOKEN not set - gated models (e.g., embeddinggemma-300m) will be gracefully skipped"
fi
echo "::endgroup::"

- name: Install Helm chart (CI minimal config)
run: |
echo "::group::Install Chart"
# CI environment: Download only the essential model to avoid OOM
# Only download all-MiniLM-L12-v2 (smallest model, ~120 MB)
# Note: Default values include embeddinggemma-300m, which will be skipped if hf-token-secret is not available
helm install semantic-router ${{ env.CHART_PATH }} \
--namespace vllm-semantic-router-system \
--wait \
6 changes: 6 additions & 0 deletions .github/workflows/integration-test-k8s.yml
@@ -81,6 +81,12 @@ jobs:

- name: Run Integration E2E tests (${{ matrix.profile }})
id: e2e-test
env:
# HF_TOKEN is required for downloading gated models (e.g., embeddinggemma-300m)
# For PRs from forks, this will be empty and the E2E framework will gracefully skip gated model downloads
# The hf CLI uses HUGGINGFACE_HUB_TOKEN, so we set both for compatibility
HF_TOKEN: ${{ secrets.HF_TOKEN }}
HUGGINGFACE_HUB_TOKEN: ${{ secrets.HF_TOKEN }}
run: |
set +e # Don't exit on error; we want to capture the result
make e2e-test E2E_PROFILE=${{ matrix.profile }} E2E_VERBOSE=true E2E_KEEP_CLUSTER=false
6 changes: 5 additions & 1 deletion .github/workflows/performance-nightly.yml
@@ -70,9 +70,13 @@ jobs:
- name: Download models (minimal set for nightly)
env:
CI_MINIMAL_MODELS: false
HF_TOKEN: ${{ secrets.HF_TOKEN }}
HF_HUB_ENABLE_HF_TRANSFER: 1
HF_HUB_DISABLE_TELEMETRY: 1
# HF_TOKEN is required for downloading gated models (e.g., embeddinggemma-300m)
# For PRs from forks, this will be empty and the model_manager will gracefully skip gated models
# The hf CLI uses HUGGINGFACE_HUB_TOKEN, so we set both for compatibility
HF_TOKEN: ${{ secrets.HF_TOKEN }}
HUGGINGFACE_HUB_TOKEN: ${{ secrets.HF_TOKEN }}
run: make download-models

- name: Create reports directory
6 changes: 5 additions & 1 deletion .github/workflows/performance-test.yml
@@ -79,9 +79,13 @@ jobs:
- name: Download models (minimal)
env:
CI_MINIMAL_MODELS: true
HF_TOKEN: ${{ secrets.HF_TOKEN }}
HF_HUB_ENABLE_HF_TRANSFER: 1
HF_HUB_DISABLE_TELEMETRY: 1
# HF_TOKEN is required for downloading gated models (e.g., embeddinggemma-300m)
# For PRs from forks, this will be empty and the model_manager will gracefully skip gated models
# The hf CLI uses HUGGINGFACE_HUB_TOKEN, so we set both for compatibility
HF_TOKEN: ${{ secrets.HF_TOKEN }}
HUGGINGFACE_HUB_TOKEN: ${{ secrets.HF_TOKEN }}
run: make download-models

- name: Run component benchmarks
12 changes: 10 additions & 2 deletions .github/workflows/test-and-build.yml
@@ -138,12 +138,20 @@ jobs:
pip install -r src/model_manager/requirements.txt

- name: Download models (minimal on PRs)
run: |
echo "Downloading models for CI..."
echo "Note: Gated models (e.g., embeddinggemma-300m) will be gracefully skipped if HF_TOKEN is not available."
echo "This is expected for PRs from forks where secrets are not exposed."
make download-models
env:
CI_MINIMAL_MODELS: ${{ github.event_name == 'pull_request' }}
HF_TOKEN: ${{ secrets.HF_TOKEN }}
HF_HUB_ENABLE_HF_TRANSFER: 1
HF_HUB_DISABLE_TELEMETRY: 1
run: make download-models
# HF_TOKEN is required for downloading gated models (e.g., embeddinggemma-300m)
# For PRs from forks, this will be empty and the model_manager will gracefully skip gated models
# The hf CLI uses HUGGINGFACE_HUB_TOKEN, so we set both for compatibility
HF_TOKEN: ${{ secrets.HF_TOKEN }}
HUGGINGFACE_HUB_TOKEN: ${{ secrets.HF_TOKEN }}

- name: Start Milvus service
run: |
67 changes: 38 additions & 29 deletions candle-binding/semantic-router_test.go
@@ -1482,34 +1482,33 @@ func TestGetEmbeddingSmart(t *testing.T) {
}

t.Run("ShortTextHighLatency", func(t *testing.T) {
// Short text with high latency priority - uses Qwen3 (1024) since Gemma is not available
// Short text with high latency priority should use Gemma (768)
text := "Hello world"
embedding, err := GetEmbeddingSmart(text, 0.3, 0.8)

if err != nil {
t.Fatalf("GetEmbeddingSmart failed: %v", err)
}

// Expect Qwen3 (1024) dimension since Gemma is not available
if len(embedding) != 1024 {
t.Errorf("Expected 1024-dim embedding, got %d", len(embedding))
if len(embedding) != 768 {
t.Errorf("Expected 768-dim embedding, got %d", len(embedding))
}

t.Logf("Short text embedding generated: dim=%d", len(embedding))
})

t.Run("MediumTextBalanced", func(t *testing.T) {
// Medium text with balanced priorities - uses Qwen3 (1024) since Gemma is not available
// Medium text with balanced priorities - may select Qwen3 (1024) or Gemma (768)
text := strings.Repeat("This is a medium length text with enough words to exceed 512 tokens. ", 10)
embedding, err := GetEmbeddingSmart(text, 0.5, 0.5)

if err != nil {
t.Fatalf("GetEmbeddingSmart failed: %v", err)
}

// Expect Qwen3 (1024) dimension since Gemma is not available
if len(embedding) != 1024 {
t.Errorf("Expected 1024-dim embedding, got %d", len(embedding))
// Accept both Qwen3 (1024) and Gemma (768) dimensions
if len(embedding) != 768 && len(embedding) != 1024 {
t.Errorf("Expected 768 or 1024-dim embedding, got %d", len(embedding))
}

t.Logf("Medium text embedding generated: dim=%d", len(embedding))
@@ -1569,9 +1568,9 @@ func TestGetEmbeddingSmart(t *testing.T) {
return
}

// Expect Qwen3 (1024) since Gemma is not available
if len(embedding) != 1024 {
t.Errorf("Expected 1024-dim embedding, got %d", len(embedding))
// Smart routing may select Qwen3 (1024) or Gemma (768) based on priorities
if len(embedding) != 768 && len(embedding) != 1024 {
t.Errorf("Expected 768 or 1024-dim embedding, got %d", len(embedding))
}
t.Logf("Priority test %s: generated %d-dim embedding", tc.desc, len(embedding))
})
@@ -1594,9 +1593,9 @@ func TestGetEmbeddingSmart(t *testing.T) {
continue
}

// Expect Qwen3 (1024) since Gemma is not available
if len(embedding) != 1024 {
t.Errorf("Iteration %d: Expected 1024-dim embedding, got %d", i, len(embedding))
// Smart routing may select Qwen3 (1024) or Gemma (768)
if len(embedding) != 768 && len(embedding) != 1024 {
t.Errorf("Iteration %d: Expected 768 or 1024-dim embedding, got %d", i, len(embedding))
}

// Verify no nil pointers
@@ -1635,12 +1634,11 @@ func BenchmarkGetEmbeddingSmart(b *testing.B) {
}

// Test constants for embedding models (Phase 4.2)
// Note: Gemma model is gated and requires HF_TOKEN, so tests use Qwen3 only
const (
Qwen3EmbeddingModelPath = "../models/Qwen3-Embedding-0.6B"
GemmaEmbeddingModelPath = "" // Gemma is gated, not used in CI tests
GemmaEmbeddingModelPath = "../models/embeddinggemma-300m"
TestEmbeddingText = "This is a test sentence for embedding generation"
TestLongContextText = "This is a longer text that might benefit from long-context embedding models like Qwen3"
TestLongContextText = "This is a longer text that might benefit from long-context embedding models like Qwen3 or Gemma"
)

// Test constants for Qwen3 Multi-LoRA
@@ -1702,8 +1700,22 @@ func TestInitEmbeddingModels(t *testing.T) {
})

t.Run("InitGemmaOnly", func(t *testing.T) {
// Gemma is a gated model requiring HF_TOKEN, skip in CI
t.Skip("Skipping Gemma-only test: Gemma is a gated model requiring HF_TOKEN")
err := InitEmbeddingModels("", GemmaEmbeddingModelPath, true)
if err != nil {
t.Logf("InitEmbeddingModels (Gemma only) returned error (may already be initialized): %v", err)

// Verify functionality
_, testErr := GetEmbeddingSmart("test", 0.5, 0.5)
if testErr == nil {
t.Log("✓ ModelFactory is functional (already initialized)")
} else {
if isModelInitializationError(testErr) {
t.Skipf("Skipping test due to model unavailability: %v", testErr)
}
}
} else {
t.Log("✓ Gemma model initialized successfully")
}
})

t.Run("InitWithInvalidPaths", func(t *testing.T) {
@@ -1785,16 +1797,16 @@ func TestGetEmbeddingWithDim(t *testing.T) {

t.Run("OversizedDimension", func(t *testing.T) {
// Test graceful degradation when requested dimension exceeds model capacity
// Qwen3: 1024, so 2048 should fall back to full dimension
// Qwen3: 1024, Gemma: 768, so 2048 should fall back to full dimension
embedding, err := GetEmbeddingWithDim(TestEmbeddingText, 0.5, 0.5, 2048)
if err != nil {
t.Errorf("Should gracefully handle oversized dimension, got error: %v", err)
return
}

// Should return full dimension (1024 for Qwen3)
if len(embedding) != 1024 {
t.Errorf("Expected full dimension (1024), got %d", len(embedding))
// Should return full dimension (1024 for Qwen3 or 768 for Gemma)
if len(embedding) != 1024 && len(embedding) != 768 {
t.Errorf("Expected full dimension (1024 or 768), got %d", len(embedding))
} else {
t.Logf("✓ Oversized dimension gracefully degraded to full dimension: %d", len(embedding))
}
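The OversizedDimension test above encodes two rules: a request above the model's native dimension falls back to the full embedding, and a smaller request is served by Matryoshka-style prefix truncation. A sketch of that rule under the assumption that truncated vectors are re-normalized (the function name is illustrative):

```python
# Hypothetical sketch of the dimension rule these tests exercise; the real
# implementation lives in the candle-binding layer, not in this PR.
import numpy as np

def apply_requested_dim(full: np.ndarray, requested_dim: int) -> np.ndarray:
    native_dim = full.shape[-1]  # 1024 for Qwen3, 768 for Gemma
    if requested_dim >= native_dim:
        return full  # graceful degradation: return the full dimension
    truncated = full[..., :requested_dim]  # Matryoshka prefix truncation
    # Re-normalize so cosine similarities stay meaningful after truncation.
    norms = np.linalg.norm(truncated, axis=-1, keepdims=True)
    return truncated / np.clip(norms, 1e-12, None)
```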
@@ -1889,9 +1901,6 @@ func TestEmbeddingPriorityRouting(t *testing.T) {
if err != nil {
t.Fatalf("Failed to initialize embedding models: %v", err)
}

// Note: These tests use Matryoshka dimension truncation (768) with Qwen3 model
// The dimension is truncated from Qwen3's full 1024 dimensions
testCases := []struct {
name string
text string
@@ -1906,23 +1915,23 @@
qualityPriority: 0.2,
latencyPriority: 0.9,
expectedDim: 768,
description: "Uses Qwen3 with Matryoshka 768 truncation",
description: "Should prefer faster embedding model (Gemma > Qwen3)",
},
{
name: "HighQualityPriority",
text: strings.Repeat("Long context text ", 30),
qualityPriority: 0.9,
latencyPriority: 0.2,
expectedDim: 768,
description: "Uses Qwen3 with Matryoshka 768 truncation",
description: "Should prefer quality model (Qwen3/Gemma)",
},
{
name: "BalancedPriority",
text: "Medium length text for embedding",
qualityPriority: 0.5,
latencyPriority: 0.5,
expectedDim: 768,
description: "Uses Qwen3 with Matryoshka 768 truncation",
description: "Should select based on text length",
},
}

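The updated test descriptions ("should prefer faster embedding model", "should select based on text length") imply a weighted scoring heuristic over the two models. One plausible shape for that rule, sketched with assumed quality/speed scores and context limits; the actual candle-binding selection logic is not part of this diff:

```python
# Hypothetical sketch of priority-based model selection; the quality and
# speed figures below are assumptions, not measured values.
from dataclasses import dataclass

@dataclass(frozen=True)
class EmbeddingModel:
    name: str
    dim: int
    max_tokens: int  # assumed context limits: Qwen3 32768, Gemma 2048
    quality: float   # assumed relative quality in [0, 1]
    speed: float     # assumed relative speed in [0, 1]

QWEN3 = EmbeddingModel("Qwen3-Embedding-0.6B", 1024, 32768, quality=0.9, speed=0.6)
GEMMA = EmbeddingModel("embeddinggemma-300m", 768, 2048, quality=0.7, speed=0.9)

def select_model(text_tokens: int, quality_priority: float,
                 latency_priority: float) -> EmbeddingModel:
    # Only models whose context window fits the input are candidates.
    candidates = [m for m in (QWEN3, GEMMA) if m.max_tokens >= text_tokens]
    if not candidates:
        return QWEN3  # fall back to the long-context model
    # A high latency priority favors the faster model; a high quality
    # priority favors the stronger one.
    return max(candidates, key=lambda m: quality_priority * m.quality
                                         + latency_priority * m.speed)
```

Under these assumed scores, "Hello world" with latency priority 0.8 and quality priority 0.3 scores Gemma at 0.93 versus Qwen3 at 0.75, consistent with the 768-dim expectation in ShortTextHighLatency.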
4 changes: 4 additions & 0 deletions config/model_manager/models.lora.yaml
@@ -29,3 +29,7 @@ models:

- id: Qwen3-Embedding-0.6B
repo_id: Qwen/Qwen3-Embedding-0.6B

# Gated model - requires HF_TOKEN (will be gracefully skipped if the token is not available)
- id: embeddinggemma-300m
repo_id: google/embeddinggemma-300m
8 changes: 6 additions & 2 deletions config/model_manager/models.minimal.yaml
@@ -10,8 +10,8 @@
# Equivalent to: make download-models-minimal
# or CI_MINIMAL_MODELS=true make download-models
#
# Note: This is the minimal set for fast CI runs. Larger models like
# embeddinggemma-300m are in models.yaml (full set) for local development.
# Note: This is the minimal set for fast CI runs. Gated models like
# embeddinggemma-300m will be gracefully skipped if HF_TOKEN is not available.

cache_dir: "models"
verify: "size" # Use size for faster CI runs
@@ -56,6 +56,10 @@ models:
- id: Qwen3-Embedding-0.6B
repo_id: Qwen/Qwen3-Embedding-0.6B

# Gated model - requires HF_TOKEN (will be gracefully skipped if the token is not available)
- id: embeddinggemma-300m
repo_id: google/embeddinggemma-300m

# =============================================================================
# Hallucination Detection - Required for hallucination tests
# =============================================================================
29 changes: 25 additions & 4 deletions deploy/helm/semantic-router/templates/deployment.yaml
@@ -42,13 +42,33 @@ spec:
command: ["/bin/bash", "-c"]
args:
- |
set -e
echo "Downloading models to persistent volume..."
cd /app/models

{{- range .Values.initContainer.models }}
# Download {{ .name }}
echo "Downloading {{ .name }} from {{ .repo }}..."

{{- if or (eq .name "embeddinggemma-300m") (contains "embeddinggemma" .name) }}
# Skip gated models if token is missing
if [ -z "${HF_TOKEN:-}" ] && [ -z "${HUGGINGFACE_HUB_TOKEN:-}" ]; then
echo "⚠️ Skipping {{ .name }} (HF_TOKEN not set, gated model requires authentication)"
else
# Remove .cache directory to ensure fresh download
rm -rf "{{ .name }}/.cache" 2>/dev/null || true
# Download with ignore_patterns to exclude ONNX-only files if pytorch model exists
python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='{{ .repo }}', local_dir='{{ .name }}', ignore_patterns=['*.onnx', '*.msgpack', '*.h5', '*.tflite'] if '{{ .name }}' == 'all-MiniLM-L12-v2' else None)"

# Check for required model files
echo "Checking {{ .name }} for required files:"
if [ -f "{{ .name }}/pytorch_model.bin" ] || [ -f "{{ .name }}/model.safetensors" ]; then
echo "✓ Found PyTorch model weights in {{ .name }}"
else
echo "✗ WARNING: No PyTorch model weights found in {{ .name }}"
ls -la "{{ .name }}/" | head -20
fi
fi # End of HF_TOKEN check for Gemma
{{- else }}
# Remove .cache directory to ensure fresh download
rm -rf "{{ .name }}/.cache" 2>/dev/null || true
# Download with ignore_patterns to exclude ONNX-only files if pytorch model exists
@@ -62,16 +82,17 @@ spec:
echo "✗ WARNING: No PyTorch model weights found in {{ .name }}"
ls -la "{{ .name }}/" | head -20
fi
{{- end }}

{{- end }}
echo "All models downloaded successfully!"
ls -la /app/models/
env:
- name: HF_HUB_CACHE
value: /tmp/hf_cache
{{- with .Values.initContainer.env }}
{{- toYaml . | nindent 10 }}
{{- end }}
{{- with .Values.initContainer.env }}
{{- toYaml . | nindent 8 }}
{{- end }}
resources:
{{- toYaml .Values.initContainer.resources | nindent 10 }}
volumeMounts:
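For readability, the `python -c` one-liner embedded in the init container above is equivalent to this expanded form (only the `download` helper name is new; `repo_id` and `name` mirror the template's `{{ .repo }}` and `{{ .name }}`):

```python
# Expanded form of the init container's `python -c` one-liner.
from huggingface_hub import snapshot_download

def download(repo_id: str, name: str) -> None:
    # all-MiniLM-L12-v2 ships PyTorch weights, so skip ONNX and other
    # framework exports; every other model downloads all files.
    ignore = (
        ["*.onnx", "*.msgpack", "*.h5", "*.tflite"]
        if name == "all-MiniLM-L12-v2"
        else None
    )
    snapshot_download(repo_id=repo_id, local_dir=name, ignore_patterns=ignore)
```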