
Commit fee395f

feat: re-enable EmbeddingGemma-300m support

Author: Liav Weiss
Signed-off-by: Liav Weiss <[email protected]>
1 parent 317dfcf, commit fee395f

File tree

13 files changed: +376 −62 lines changed

.github/workflows/integration-test-docker.yml

Lines changed: 12 additions & 0 deletions

@@ -82,12 +82,24 @@ jobs:
       - name: Download models
         run: |
           echo "Downloading minimal models for CI..."
+          # Authenticate with HuggingFace if token is available
+          # Note: For PRs from forks, HF_TOKEN is not available (GitHub security feature)
+          # The makefile will gracefully skip gated models (e.g., embeddinggemma-300m) if token is missing
+          if [ -n "${HF_TOKEN:-}" ]; then
+            huggingface-cli login --token "$HF_TOKEN" --add-to-git-credential
+            export HUGGINGFACE_HUB_TOKEN="$HF_TOKEN"
+          fi
           make download-models
         env:
           CI: true
           CI_MINIMAL_MODELS: true
           HF_HUB_ENABLE_HF_TRANSFER: 1
           HF_HUB_DISABLE_TELEMETRY: 1
+          # HF_TOKEN is required for downloading gated models (e.g., embeddinggemma-300m)
+          # For PRs from forks, this will be empty and the makefile will gracefully skip gated models
+          # The hf CLI uses HUGGINGFACE_HUB_TOKEN, so we set both for compatibility
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          HUGGINGFACE_HUB_TOKEN: ${{ secrets.HF_TOKEN }}

       - name: Start CI services
         run: |
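For reference, the "graceful skip" the comments above describe boils down to checking for a token before attempting the gated download. The repository's real download-models target is not part of this diff, so the recipe below is only an illustrative Makefile sketch of that pattern; target body, paths, and CLI invocations are assumptions:

# Illustrative sketch only, not the repository's actual Makefile
download-models:
	@if [ -z "$${HF_TOKEN:-}" ] && [ -z "$${HUGGINGFACE_HUB_TOKEN:-}" ]; then \
		echo "HF_TOKEN not set; skipping gated model embeddinggemma-300m"; \
	else \
		huggingface-cli download google/embeddinggemma-300m --local-dir models/embeddinggemma-300m; \
	fi
	huggingface-cli download Qwen/Qwen3-Embedding-0.6B --local-dir models/Qwen3-Embedding-0.6B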

.github/workflows/integration-test-k8s.yml

Lines changed: 4 additions & 0 deletions

@@ -81,6 +81,10 @@ jobs:

       - name: Run Integration E2E tests (${{ matrix.profile }})
         id: e2e-test
+        env:
+          # Pass HF_TOKEN to E2E tests for downloading gated models (e.g., embeddinggemma-300m)
+          # For PRs from forks, this will be empty and the E2E framework will gracefully skip gated model downloads
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
         run: |
           set +e # Don't exit on error, we want to capture the result
           make e2e-test E2E_PROFILE=${{ matrix.profile }} E2E_VERBOSE=true E2E_KEEP_CLUSTER=false
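To reproduce this step outside CI, the same variable can be passed directly on the make command line shown above; both placeholders below stand in for real values, since the profile comes from the job matrix:

# Local run with a gated-model token; <your-token> and <profile> are placeholders
HF_TOKEN=<your-token> make e2e-test E2E_PROFILE=<profile> E2E_VERBOSE=true E2E_KEEP_CLUSTER=false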

.github/workflows/performance-nightly.yml

Lines changed: 5 additions & 0 deletions

@@ -71,6 +71,11 @@ jobs:
           CI_MINIMAL_MODELS: true
           HF_HUB_ENABLE_HF_TRANSFER: 1
           HF_HUB_DISABLE_TELEMETRY: 1
+          # HF_TOKEN is required for downloading gated models (e.g., embeddinggemma-300m)
+          # For PRs from forks, this will be empty and the makefile will gracefully skip gated models
+          # The hf CLI uses HUGGINGFACE_HUB_TOKEN, so we set both for compatibility
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          HUGGINGFACE_HUB_TOKEN: ${{ secrets.HF_TOKEN }}
         run: make download-models

       - name: Create reports directory

.github/workflows/performance-test.yml

Lines changed: 5 additions & 0 deletions

@@ -81,6 +81,11 @@ jobs:
           CI_MINIMAL_MODELS: true
           HF_HUB_ENABLE_HF_TRANSFER: 1
           HF_HUB_DISABLE_TELEMETRY: 1
+          # HF_TOKEN is required for downloading gated models (e.g., embeddinggemma-300m)
+          # For PRs from forks, this will be empty and the makefile will gracefully skip gated models
+          # The hf CLI uses HUGGINGFACE_HUB_TOKEN, so we set both for compatibility
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          HUGGINGFACE_HUB_TOKEN: ${{ secrets.HF_TOKEN }}
         run: make download-models

       - name: Download performance baselines

.github/workflows/test-and-build.yml

Lines changed: 5 additions & 0 deletions

@@ -143,6 +143,11 @@ jobs:
           CI_MINIMAL_MODELS: ${{ github.event_name == 'pull_request' }}
           HF_HUB_ENABLE_HF_TRANSFER: 1
           HF_HUB_DISABLE_TELEMETRY: 1
+          # HF_TOKEN is required for downloading gated models (e.g., embeddinggemma-300m)
+          # For PRs from forks, this will be empty and the makefile will gracefully skip gated models
+          # The hf CLI uses HUGGINGFACE_HUB_TOKEN, so we set both for compatibility
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          HUGGINGFACE_HUB_TOKEN: ${{ secrets.HF_TOKEN }}
         run: make download-models

       - name: Start Milvus service

candle-binding/semantic-router_test.go

Lines changed: 38 additions & 29 deletions
@@ -1482,34 +1482,33 @@ func TestGetEmbeddingSmart(t *testing.T) {
 	}

 	t.Run("ShortTextHighLatency", func(t *testing.T) {
-		// Short text with high latency priority - uses Qwen3 (1024) since Gemma is not available
+		// Short text with high latency priority should use Gemma (768)
 		text := "Hello world"
 		embedding, err := GetEmbeddingSmart(text, 0.3, 0.8)

 		if err != nil {
 			t.Fatalf("GetEmbeddingSmart failed: %v", err)
 		}

-		// Expect Qwen3 (1024) dimension since Gemma is not available
-		if len(embedding) != 1024 {
-			t.Errorf("Expected 1024-dim embedding, got %d", len(embedding))
+		if len(embedding) != 768 {
+			t.Errorf("Expected 768-dim embedding, got %d", len(embedding))
 		}

 		t.Logf("Short text embedding generated: dim=%d", len(embedding))
 	})

 	t.Run("MediumTextBalanced", func(t *testing.T) {
-		// Medium text with balanced priorities - uses Qwen3 (1024) since Gemma is not available
+		// Medium text with balanced priorities - may select Qwen3 (1024) or Gemma (768)
 		text := strings.Repeat("This is a medium length text with enough words to exceed 512 tokens. ", 10)
 		embedding, err := GetEmbeddingSmart(text, 0.5, 0.5)

 		if err != nil {
 			t.Fatalf("GetEmbeddingSmart failed: %v", err)
 		}

-		// Expect Qwen3 (1024) dimension since Gemma is not available
-		if len(embedding) != 1024 {
-			t.Errorf("Expected 1024-dim embedding, got %d", len(embedding))
+		// Accept both Qwen3 (1024) and Gemma (768) dimensions
+		if len(embedding) != 768 && len(embedding) != 1024 {
+			t.Errorf("Expected 768 or 1024-dim embedding, got %d", len(embedding))
 		}

 		t.Logf("Medium text embedding generated: dim=%d", len(embedding))

@@ -1569,9 +1568,9 @@ func TestGetEmbeddingSmart(t *testing.T) {
 			return
 		}

-		// Expect Qwen3 (1024) since Gemma is not available
-		if len(embedding) != 1024 {
-			t.Errorf("Expected 1024-dim embedding, got %d", len(embedding))
+		// Smart routing may select Qwen3 (1024) or Gemma (768) based on priorities
+		if len(embedding) != 768 && len(embedding) != 1024 {
+			t.Errorf("Expected 768 or 1024-dim embedding, got %d", len(embedding))
 		}
 		t.Logf("Priority test %s: generated %d-dim embedding", tc.desc, len(embedding))
 	})

@@ -1594,9 +1593,9 @@ func TestGetEmbeddingSmart(t *testing.T) {
 			continue
 		}

-		// Expect Qwen3 (1024) since Gemma is not available
-		if len(embedding) != 1024 {
-			t.Errorf("Iteration %d: Expected 1024-dim embedding, got %d", i, len(embedding))
+		// Smart routing may select Qwen3 (1024) or Gemma (768)
+		if len(embedding) != 768 && len(embedding) != 1024 {
+			t.Errorf("Iteration %d: Expected 768 or 1024-dim embedding, got %d", i, len(embedding))
 		}

 		// Verify no nil pointers

@@ -1635,12 +1634,11 @@ func BenchmarkGetEmbeddingSmart(b *testing.B) {
 }

 // Test constants for embedding models (Phase 4.2)
-// Note: Gemma model is gated and requires HF_TOKEN, so tests use Qwen3 only
 const (
 	Qwen3EmbeddingModelPath = "../models/Qwen3-Embedding-0.6B"
-	GemmaEmbeddingModelPath = "" // Gemma is gated, not used in CI tests
+	GemmaEmbeddingModelPath = "../models/embeddinggemma-300m"
 	TestEmbeddingText       = "This is a test sentence for embedding generation"
-	TestLongContextText     = "This is a longer text that might benefit from long-context embedding models like Qwen3"
+	TestLongContextText     = "This is a longer text that might benefit from long-context embedding models like Qwen3 or Gemma"
 )

 // Test constants for Qwen3 Multi-LoRA

@@ -1702,8 +1700,22 @@ func TestInitEmbeddingModels(t *testing.T) {
 	})

 	t.Run("InitGemmaOnly", func(t *testing.T) {
-		// Gemma is a gated model requiring HF_TOKEN, skip in CI
-		t.Skip("Skipping Gemma-only test: Gemma is a gated model requiring HF_TOKEN")
+		err := InitEmbeddingModels("", GemmaEmbeddingModelPath, true)
+		if err != nil {
+			t.Logf("InitEmbeddingModels (Gemma only) returned error (may already be initialized): %v", err)
+
+			// Verify functionality
+			_, testErr := GetEmbeddingSmart("test", 0.5, 0.5)
+			if testErr == nil {
+				t.Log("✓ ModelFactory is functional (already initialized)")
+			} else {
+				if isModelInitializationError(testErr) {
+					t.Skipf("Skipping test due to model unavailability: %v", testErr)
+				}
+			}
+		} else {
+			t.Log("✓ Gemma model initialized successfully")
+		}
 	})

 	t.Run("InitWithInvalidPaths", func(t *testing.T) {

@@ -1785,16 +1797,16 @@ func TestGetEmbeddingWithDim(t *testing.T) {

 	t.Run("OversizedDimension", func(t *testing.T) {
 		// Test graceful degradation when requested dimension exceeds model capacity
-		// Qwen3: 1024, so 2048 should fall back to full dimension
+		// Qwen3: 1024, Gemma: 768, so 2048 should fall back to full dimension
 		embedding, err := GetEmbeddingWithDim(TestEmbeddingText, 0.5, 0.5, 2048)
 		if err != nil {
 			t.Errorf("Should gracefully handle oversized dimension, got error: %v", err)
 			return
 		}

-		// Should return full dimension (1024 for Qwen3)
-		if len(embedding) != 1024 {
-			t.Errorf("Expected full dimension (1024), got %d", len(embedding))
+		// Should return full dimension (1024 for Qwen3 or 768 for Gemma)
+		if len(embedding) != 1024 && len(embedding) != 768 {
+			t.Errorf("Expected full dimension (1024 or 768), got %d", len(embedding))
 		} else {
 			t.Logf("✓ Oversized dimension gracefully degraded to full dimension: %d", len(embedding))
 		}

@@ -1889,9 +1901,6 @@ func TestEmbeddingPriorityRouting(t *testing.T) {
 	if err != nil {
 		t.Fatalf("Failed to initialize embedding models: %v", err)
 	}
-
-	// Note: These tests use Matryoshka dimension truncation (768) with Qwen3 model
-	// The dimension is truncated from Qwen3's full 1024 dimensions
 	testCases := []struct {
 		name            string
 		text            string

@@ -1906,23 +1915,23 @@ func TestEmbeddingPriorityRouting(t *testing.T) {
 			qualityPriority: 0.2,
 			latencyPriority: 0.9,
 			expectedDim:     768,
-			description:     "Uses Qwen3 with Matryoshka 768 truncation",
+			description:     "Should prefer faster embedding model (Gemma > Qwen3)",
 		},
 		{
 			name:            "HighQualityPriority",
 			text:            strings.Repeat("Long context text ", 30),
 			qualityPriority: 0.9,
 			latencyPriority: 0.2,
 			expectedDim:     768,
-			description:     "Uses Qwen3 with Matryoshka 768 truncation",
+			description:     "Should prefer quality model (Qwen3/Gemma)",
 		},
 		{
 			name:            "BalancedPriority",
 			text:            "Medium length text for embedding",
 			qualityPriority: 0.5,
 			latencyPriority: 0.5,
 			expectedDim:     768,
-			description:     "Uses Qwen3 with Matryoshka 768 truncation",
+			description:     "Should select based on text length",
 		},
 	}
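Taken together, the test changes describe a dual-model routing contract: latency-leaning priorities may route to Gemma (768 dims), quality-leaning ones to Qwen3 (1024 dims), and both initialization and lookup degrade gracefully when a model is missing. A condensed sketch of that contract, written as if it lived in the same package as these tests; the meaning of InitEmbeddingModels' final bool argument is assumed from its usage here (taken to select CPU execution), not documented in this diff:

// Sketch only: mirrors the call patterns exercised by the tests above.
func smokeTestEmbeddingRouting(t *testing.T) {
	// Initialize both models; the trailing bool is assumed to select CPU execution.
	if err := InitEmbeddingModels(Qwen3EmbeddingModelPath, GemmaEmbeddingModelPath, true); err != nil {
		t.Skipf("models unavailable: %v", err) // graceful skip, as in the tests above
	}

	// Latency-leaning priorities (quality=0.3, latency=0.8) favor Gemma (768 dims);
	// quality-leaning priorities may pick Qwen3 (1024 dims).
	emb, err := GetEmbeddingSmart("Hello world", 0.3, 0.8)
	if err != nil {
		t.Fatalf("GetEmbeddingSmart failed: %v", err)
	}
	if len(emb) != 768 && len(emb) != 1024 {
		t.Errorf("unexpected embedding dimension: %d", len(emb))
	}

	// Matryoshka truncation: request 768 dims explicitly; oversized requests
	// (e.g., 2048) fall back to the selected model's full dimension.
	if emb768, err := GetEmbeddingWithDim("Hello world", 0.5, 0.5, 768); err == nil {
		t.Logf("truncated embedding: %d dims", len(emb768))
	}
}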

deploy/helm/semantic-router/templates/deployment.yaml

Lines changed: 35 additions & 14 deletions

@@ -49,29 +49,50 @@ spec:
             {{- range .Values.initContainer.models }}
             # Download {{ .name }}
             echo "Downloading {{ .name }} from {{ .repo }}..."
-            # Remove .cache directory to ensure fresh download
-            rm -rf "{{ .name }}/.cache" 2>/dev/null || true
-            # Download with ignore_patterns to exclude ONNX-only files if pytorch model exists
-            python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='{{ .repo }}', local_dir='{{ .name }}', ignore_patterns=['*.onnx', '*.msgpack', '*.h5', '*.tflite'] if '{{ .name }}' == 'all-MiniLM-L12-v2' else None)"
-
-            # Check for required model files
-            echo "Checking {{ .name }} for required files:"
-            if [ -f "{{ .name }}/pytorch_model.bin" ] || [ -f "{{ .name }}/model.safetensors" ]; then
-              echo "✓ Found PyTorch model weights in {{ .name }}"
+
+            # Check if this is a gated model and if token is missing
+            {{- if or (eq .name "embeddinggemma-300m") (contains "embeddinggemma" .name) }}
+            if [ -z "${HF_TOKEN:-}" ] && [ -z "${HUGGINGFACE_HUB_TOKEN:-}" ]; then
+              echo "⚠️ Warning: HF_TOKEN not set, skipping {{ .name }} download (gated model requires authentication)"
+              echo "   This is expected for PRs from forks where secrets are not available"
+              echo "   Continuing with other models..."
             else
-              echo "✗ WARNING: No PyTorch model weights found in {{ .name }}"
-              ls -la "{{ .name }}/" | head -20
+            {{- end }}
+            # Remove .cache directory to ensure fresh download
+            rm -rf "{{ .name }}/.cache" 2>/dev/null || true
+            # Download with ignore_patterns to exclude ONNX-only files if pytorch model exists
+            python -c "
+            from huggingface_hub import snapshot_download
+
+            repo_id = '{{ .repo }}'
+            local_dir = '{{ .name }}'
+            ignore_patterns = ['*.onnx', '*.msgpack', '*.h5', '*.tflite'] if '{{ .name }}' == 'all-MiniLM-L12-v2' else None
+
+            snapshot_download(repo_id=repo_id, local_dir=local_dir, ignore_patterns=ignore_patterns)
+            print(f'✓ Successfully downloaded {repo_id}')
+            "
+
+            # Check for required model files
+            echo "Checking {{ .name }} for required files:"
+            if [ -f "{{ .name }}/pytorch_model.bin" ] || [ -f "{{ .name }}/model.safetensors" ]; then
+              echo "✓ Found PyTorch model weights in {{ .name }}"
+            else
+              echo "✗ WARNING: No PyTorch model weights found in {{ .name }}"
+              ls -la "{{ .name }}/" | head -20
+            fi
+            {{- if or (eq .name "embeddinggemma-300m") (contains "embeddinggemma" .name) }}
             fi
+            {{- end }}
             {{- end }}
             echo "All models downloaded successfully!"
             ls -la /app/models/
           env:
             - name: HF_HUB_CACHE
               value: /tmp/hf_cache
-            {{- with .Values.initContainer.env }}
-            {{- toYaml . | nindent 10 }}
-            {{- end }}
+          {{- with .Values.initContainer.env }}
+          {{- toYaml . | nindent 8 }}
+          {{- end }}
           resources:
             {{- toYaml .Values.initContainer.resources | nindent 10 }}
           volumeMounts:
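One detail worth noting about the expanded python -c block: huggingface_hub resolves credentials from the environment, so once the init container receives HF_TOKEN / HUGGINGFACE_HUB_TOKEN (see the values.yaml change below), snapshot_download can fetch the gated google/embeddinggemma-300m repo without an explicit login. A standalone equivalent, with the token passed explicitly for clarity (paths mirror the chart's defaults):

# Standalone sketch of the init container's download step
import os
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="google/embeddinggemma-300m",
    local_dir="embeddinggemma-300m",
    token=os.environ.get("HF_TOKEN"),  # None falls back to env/cached-token resolution
)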

deploy/helm/semantic-router/values.yaml

Lines changed: 17 additions & 9 deletions

@@ -149,20 +149,28 @@ initContainer:
   # -- Additional environment variables for the init container.
   # For example, to use a private Hugging Face model, you can pass a token
   # and specify an endpoint using a pre-existing Kubernetes secret.
-  # env:
-  #   - name: HF_TOKEN
-  #     valueFrom:
-  #       secretKeyRef:
-  #         name: my-hf-secret
-  #         key: token
-  #   - name: HF_ENDPOINT
-  #     value: "https://huggingface.co"
-  env: []
+  # HF_TOKEN is required for downloading gated models like embeddinggemma-300m
+  # For PRs from forks, this will be empty and gated models will be gracefully skipped
+  env:
+    - name: HF_TOKEN
+      valueFrom:
+        secretKeyRef:
+          name: hf-token-secret
+          key: token
+          optional: true # Allow deployment even if secret doesn't exist (for local testing)
+    - name: HUGGINGFACE_HUB_TOKEN
+      valueFrom:
+        secretKeyRef:
+          name: hf-token-secret
+          key: token
+          optional: true # Allow deployment even if secret doesn't exist (for local testing)
   # -- Models to download
   models:
     # Embedding models for semantic cache and tools
     - name: Qwen3-Embedding-0.6B
       repo: Qwen/Qwen3-Embedding-0.6B
+    - name: embeddinggemma-300m
+      repo: google/embeddinggemma-300m
     - name: all-MiniLM-L12-v2
       repo: sentence-transformers/all-MiniLM-L12-v2
     - name: lora_intent_classifier_bert-base-uncased_model
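The chart now assumes an optional Secret named hf-token-secret carrying the token under the key token. One way to create it before installing the chart (the token value is a placeholder):

kubectl create secret generic hf-token-secret --from-literal=token=<your-hf-token>

Because both env references are marked optional: true, the deployment still starts when the secret is absent; the init container then skips the gated embeddinggemma-300m download and continues with the remaining models.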
