feat: re-enable EmbeddingGemma-300m support

Liav Weiss · Liav Weiss · commit 52451b152619 · 2025-12-11T23:05:40.000+02:00
Signed-off-by: Liav Weiss <lweiss@lweiss-thinkpadx1carbongen11.raanaii.csb>
diff --git a/.github/workflows/integration-test-docker.yml b/.github/workflows/integration-test-docker.yml
@@ -82,12 +82,39 @@ jobs:
       - name: Download models
         run: |
           echo "Downloading minimal models for CI..."
+          # Debug: Check if HF_TOKEN is available (without printing the actual token)
+          if [ -n "$HF_TOKEN" ]; then
+            echo "✅ HF_TOKEN is set (length: ${#HF_TOKEN} characters)"
+            echo "Authenticating with HuggingFace..."
+            huggingface-cli login --token "$HF_TOKEN" --add-to-git-credential
+            echo "✅ HuggingFace authentication successful"
+            # Also ensure environment variable is set for hf CLI
+            export HUGGINGFACE_HUB_TOKEN="$HF_TOKEN"
+          else
+            echo "⚠️  HF_TOKEN not set - gated models may fail to download"
+            echo "   This workflow requires HF_TOKEN secret to be configured in repository settings"
+            echo "   For now, Gemma model download will be skipped (expected to fail)"
+          fi
+          # Export all environment variables for make
+          export CI=true
+          export CI_MINIMAL_MODELS=true
+          export HF_HUB_ENABLE_HF_TRANSFER=1
+          export HF_HUB_DISABLE_TELEMETRY=1
+          # Pass token to make if available
+          if [ -n "$HF_TOKEN" ]; then
+            export HF_TOKEN="$HF_TOKEN"
+            export HUGGINGFACE_HUB_TOKEN="$HF_TOKEN"
+          fi
           make download-models
         env:
           CI: true
           CI_MINIMAL_MODELS: true
           HF_HUB_ENABLE_HF_TRANSFER: 1
           HF_HUB_DISABLE_TELEMETRY: 1
+          # HF_TOKEN is required for downloading gated models (e.g., embeddinggemma-300m)
+          # huggingface_hub reads HF_TOKEN; HUGGINGFACE_HUB_TOKEN is also set for tooling that expects that name
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          HUGGINGFACE_HUB_TOKEN: ${{ secrets.HF_TOKEN }}
 
       - name: Start CI services
         run: |
diff --git a/.github/workflows/integration-test-k8s.yml b/.github/workflows/integration-test-k8s.yml
@@ -80,6 +80,9 @@ jobs:
 
       - name: Run Integration E2E tests (${{ matrix.profile }})
         id: e2e-test
+        env:
+          # Pass HF_TOKEN to E2E tests for downloading gated models (e.g., embeddinggemma-300m)
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
         run: |
           set +e  # Don't exit on error, we want to capture the result
           make e2e-test E2E_PROFILE=${{ matrix.profile }} E2E_VERBOSE=true E2E_KEEP_CLUSTER=false
diff --git a/.github/workflows/performance-nightly.yml b/.github/workflows/performance-nightly.yml
@@ -71,6 +71,10 @@ jobs:
           CI_MINIMAL_MODELS: false
           HF_HUB_ENABLE_HF_TRANSFER: 1
           HF_HUB_DISABLE_TELEMETRY: 1
+          # HF_TOKEN is required for downloading gated models (e.g., embeddinggemma-300m)
+          # huggingface_hub reads HF_TOKEN; HUGGINGFACE_HUB_TOKEN is also set for tooling that expects that name
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          HUGGINGFACE_HUB_TOKEN: ${{ secrets.HF_TOKEN }}
         run: make download-models
 
       - name: Run comprehensive benchmarks
diff --git a/.github/workflows/performance-test.yml b/.github/workflows/performance-test.yml
@@ -81,6 +81,10 @@ jobs:
           CI_MINIMAL_MODELS: true
           HF_HUB_ENABLE_HF_TRANSFER: 1
           HF_HUB_DISABLE_TELEMETRY: 1
+          # HF_TOKEN is required for downloading gated models (e.g., embeddinggemma-300m)
+          # huggingface_hub reads HF_TOKEN; HUGGINGFACE_HUB_TOKEN is also set for tooling that expects that name
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          HUGGINGFACE_HUB_TOKEN: ${{ secrets.HF_TOKEN }}
         run: make download-models
 
       - name: Download performance baselines
diff --git a/.github/workflows/test-and-build.yml b/.github/workflows/test-and-build.yml
@@ -143,6 +143,10 @@ jobs:
           CI_MINIMAL_MODELS: ${{ github.event_name == 'pull_request' }}
           HF_HUB_ENABLE_HF_TRANSFER: 1
           HF_HUB_DISABLE_TELEMETRY: 1
+          # HF_TOKEN is required for downloading gated models (e.g., embeddinggemma-300m)
+          # huggingface_hub reads HF_TOKEN; HUGGINGFACE_HUB_TOKEN is also set for tooling that expects that name
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          HUGGINGFACE_HUB_TOKEN: ${{ secrets.HF_TOKEN }}
         run: make download-models
 
       - name: Start Milvus service
diff --git a/candle-binding/semantic-router_test.go b/candle-binding/semantic-router_test.go
@@ -1482,34 +1482,33 @@ func TestGetEmbeddingSmart(t *testing.T) {
 	}
 
 	t.Run("ShortTextHighLatency", func(t *testing.T) {
-		// Short text with high latency priority - uses Qwen3 (1024) since Gemma is not available
+		// Short text with high latency priority should use Gemma (768)
 		text := "Hello world"
 		embedding, err := GetEmbeddingSmart(text, 0.3, 0.8)
 
 		if err != nil {
 			t.Fatalf("GetEmbeddingSmart failed: %v", err)
 		}
 
-		// Expect Qwen3 (1024) dimension since Gemma is not available
-		if len(embedding) != 1024 {
-			t.Errorf("Expected 1024-dim embedding, got %d", len(embedding))
+		if len(embedding) != 768 {
+			t.Errorf("Expected 768-dim embedding, got %d", len(embedding))
 		}
 
 		t.Logf("Short text embedding generated: dim=%d", len(embedding))
 	})
 
 	t.Run("MediumTextBalanced", func(t *testing.T) {
-		// Medium text with balanced priorities - uses Qwen3 (1024) since Gemma is not available
+		// Medium text with balanced priorities - may select Qwen3 (1024) or Gemma (768)
 		text := strings.Repeat("This is a medium length text with enough words to exceed 512 tokens. ", 10)
 		embedding, err := GetEmbeddingSmart(text, 0.5, 0.5)
 
 		if err != nil {
 			t.Fatalf("GetEmbeddingSmart failed: %v", err)
 		}
 
-		// Expect Qwen3 (1024) dimension since Gemma is not available
-		if len(embedding) != 1024 {
-			t.Errorf("Expected 1024-dim embedding, got %d", len(embedding))
+		// Accept both Qwen3 (1024) and Gemma (768) dimensions
+		if len(embedding) != 768 && len(embedding) != 1024 {
+			t.Errorf("Expected 768 or 1024-dim embedding, got %d", len(embedding))
 		}
 
 		t.Logf("Medium text embedding generated: dim=%d", len(embedding))
@@ -1569,9 +1568,9 @@ func TestGetEmbeddingSmart(t *testing.T) {
 					return
 				}
 
-				// Expect Qwen3 (1024) since Gemma is not available
-				if len(embedding) != 1024 {
-					t.Errorf("Expected 1024-dim embedding, got %d", len(embedding))
+				// Smart routing may select Qwen3 (1024) or Gemma (768) based on priorities
+				if len(embedding) != 768 && len(embedding) != 1024 {
+					t.Errorf("Expected 768 or 1024-dim embedding, got %d", len(embedding))
 				}
 				t.Logf("Priority test %s: generated %d-dim embedding", tc.desc, len(embedding))
 			})
@@ -1594,9 +1593,9 @@ func TestGetEmbeddingSmart(t *testing.T) {
 				continue
 			}
 
-			// Expect Qwen3 (1024) since Gemma is not available
-			if len(embedding) != 1024 {
-				t.Errorf("Iteration %d: Expected 1024-dim embedding, got %d", i, len(embedding))
+			// Smart routing may select Qwen3 (1024) or Gemma (768)
+			if len(embedding) != 768 && len(embedding) != 1024 {
+				t.Errorf("Iteration %d: Expected 768 or 1024-dim embedding, got %d", i, len(embedding))
 			}
 
 			// Verify no nil pointers
@@ -1635,12 +1634,11 @@ func BenchmarkGetEmbeddingSmart(b *testing.B) {
 }
 
 // Test constants for embedding models (Phase 4.2)
-// Note: Gemma model is gated and requires HF_TOKEN, so tests use Qwen3 only
 const (
 	Qwen3EmbeddingModelPath = "../models/Qwen3-Embedding-0.6B"
-	GemmaEmbeddingModelPath = "" // Gemma is gated, not used in CI tests
+	GemmaEmbeddingModelPath = "../models/embeddinggemma-300m"
 	TestEmbeddingText       = "This is a test sentence for embedding generation"
-	TestLongContextText     = "This is a longer text that might benefit from long-context embedding models like Qwen3"
+	TestLongContextText     = "This is a longer text that might benefit from long-context embedding models like Qwen3 or Gemma"
 )
 
 // Test constants for Qwen3 Multi-LoRA
@@ -1702,8 +1700,22 @@ func TestInitEmbeddingModels(t *testing.T) {
 	})
 
 	t.Run("InitGemmaOnly", func(t *testing.T) {
-		// Gemma is a gated model requiring HF_TOKEN, skip in CI
-		t.Skip("Skipping Gemma-only test: Gemma is a gated model requiring HF_TOKEN")
+		err := InitEmbeddingModels("", GemmaEmbeddingModelPath, true)
+		if err != nil {
+			t.Logf("InitEmbeddingModels (Gemma only) returned error (may already be initialized): %v", err)
+
+			// Verify functionality
+			_, testErr := GetEmbeddingSmart("test", 0.5, 0.5)
+			if testErr == nil {
+				t.Log("✓ ModelFactory is functional (already initialized)")
+			} else {
+				if isModelInitializationError(testErr) {
+					t.Skipf("Skipping test due to model unavailability: %v", testErr)
+				}
+			}
+		} else {
+			t.Log("✓ Gemma model initialized successfully")
+		}
 	})
 
 	t.Run("InitWithInvalidPaths", func(t *testing.T) {
@@ -1785,16 +1797,16 @@ func TestGetEmbeddingWithDim(t *testing.T) {
 
 	t.Run("OversizedDimension", func(t *testing.T) {
 		// Test graceful degradation when requested dimension exceeds model capacity
-		// Qwen3: 1024, so 2048 should fall back to full dimension
+		// Qwen3: 1024, Gemma: 768, so 2048 should fall back to full dimension
 		embedding, err := GetEmbeddingWithDim(TestEmbeddingText, 0.5, 0.5, 2048)
 		if err != nil {
 			t.Errorf("Should gracefully handle oversized dimension, got error: %v", err)
 			return
 		}
 
-		// Should return full dimension (1024 for Qwen3)
-		if len(embedding) != 1024 {
-			t.Errorf("Expected full dimension (1024), got %d", len(embedding))
+		// Should return full dimension (1024 for Qwen3 or 768 for Gemma)
+		if len(embedding) != 1024 && len(embedding) != 768 {
+			t.Errorf("Expected full dimension (1024 or 768), got %d", len(embedding))
 		} else {
 			t.Logf("✓ Oversized dimension gracefully degraded to full dimension: %d", len(embedding))
 		}
@@ -1889,9 +1901,6 @@ func TestEmbeddingPriorityRouting(t *testing.T) {
 	if err != nil {
 		t.Fatalf("Failed to initialize embedding models: %v", err)
 	}
-
-	// Note: These tests use Matryoshka dimension truncation (768) with Qwen3 model
-	// The dimension is truncated from Qwen3's full 1024 dimensions
 	testCases := []struct {
 		name            string
 		text            string
@@ -1906,23 +1915,23 @@ func TestEmbeddingPriorityRouting(t *testing.T) {
 			qualityPriority: 0.2,
 			latencyPriority: 0.9,
 			expectedDim:     768,
-			description:     "Uses Qwen3 with Matryoshka 768 truncation",
+			description:     "Should prefer faster embedding model (Gemma > Qwen3)",
 		},
 		{
 			name:            "HighQualityPriority",
 			text:            strings.Repeat("Long context text ", 30),
 			qualityPriority: 0.9,
 			latencyPriority: 0.2,
 			expectedDim:     768,
-			description:     "Uses Qwen3 with Matryoshka 768 truncation",
+			description:     "Should prefer quality model (Qwen3/Gemma)",
 		},
 		{
 			name:            "BalancedPriority",
 			text:            "Medium length text for embedding",
 			qualityPriority: 0.5,
 			latencyPriority: 0.5,
 			expectedDim:     768,
-			description:     "Uses Qwen3 with Matryoshka 768 truncation",
+			description:     "Should select based on text length",
 		},
 	}
 
diff --git a/deploy/helm/semantic-router/templates/deployment.yaml b/deploy/helm/semantic-router/templates/deployment.yaml
@@ -69,9 +69,9 @@ spec:
         env:
         - name: HF_HUB_CACHE
           value: /tmp/hf_cache
-          {{- with .Values.initContainer.env }}
-          {{- toYaml . | nindent 10 }}
-          {{- end }}
+        {{- with .Values.initContainer.env }}
+        {{- toYaml . | nindent 8 }}
+        {{- end }}
         resources:
           {{- toYaml .Values.initContainer.resources | nindent 10 }}
         volumeMounts:
diff --git a/deploy/helm/semantic-router/values.yaml b/deploy/helm/semantic-router/values.yaml
@@ -149,20 +149,21 @@ initContainer:
   # -- Additional environment variables for the init container.
   # For example, to use a private Hugging Face model, you can pass a token
   # and specify an endpoint using a pre-existing Kubernetes secret.
-  # env:
-  #   - name: HF_TOKEN
-  #     valueFrom:
-  #       secretKeyRef:
-  #         name: my-hf-secret
-  #         key: token
-  #   - name: HF_ENDPOINT
-  #     value: "https://huggingface.co"
-  env: []
+  # HF_TOKEN is required for downloading gated models like embeddinggemma-300m
+  env:
+    - name: HF_TOKEN
+      valueFrom:
+        secretKeyRef:
+          name: hf-token-secret
+          key: token
+          optional: true  # Allow deployment even if secret doesn't exist (for local testing)
   # -- Models to download
   models:
     # Embedding models for semantic cache and tools
     - name: Qwen3-Embedding-0.6B
       repo: Qwen/Qwen3-Embedding-0.6B
+    - name: embeddinggemma-300m
+      repo: google/embeddinggemma-300m
     - name: all-MiniLM-L12-v2
       repo: sentence-transformers/all-MiniLM-L12-v2
     - name: lora_intent_classifier_bert-base-uncased_model
diff --git a/e2e/pkg/framework/runner.go b/e2e/pkg/framework/runner.go
@@ -114,6 +114,19 @@ func (r *Runner) Run(ctx context.Context) error {
 	// Set Kubernetes client for report generator
 	r.reporter.SetKubeClient(kubeClient)
 
+	// Step 3.5: Create HF_TOKEN secret if available (for gated model downloads)
+	// This is required for downloading gated models like google/embeddinggemma-300m
+	if hfToken := os.Getenv("HF_TOKEN"); hfToken != "" {
+		if err := r.createHFTokenSecret(ctx, kubeClient); err != nil {
+			r.log("⚠️  Warning: Failed to create HF_TOKEN secret: %v", err)
+			r.log("   Model downloads may fail if gated models (e.g., embeddinggemma-300m) are required")
+		} else {
+			r.log("✅ Created HF_TOKEN secret for gated model downloads")
+		}
+	} else {
+		r.log("ℹ️  HF_TOKEN not set - gated models (e.g., embeddinggemma-300m) may not be downloadable")
+	}
+
 	// Step 4: Setup profile (deploy Helm charts, etc.)
 	if !r.opts.SkipSetup {
 		setupOpts := &SetupOptions{
@@ -485,6 +498,68 @@ func (r *Runner) collectSemanticRouterLogs(ctx context.Context, client *kubernet
 	return nil
 }
 
+// createHFTokenSecret creates a Kubernetes secret for HF_TOKEN if it's available in the environment
+// This is required for the init container to download gated models like google/embeddinggemma-300m
+// The secret must be in the same namespace as the semantic-router deployment (vllm-semantic-router-system)
+// because Kubernetes secrets are namespace-scoped
+func (r *Runner) createHFTokenSecret(ctx context.Context, kubeClient *kubernetes.Clientset) error {
+	hfToken := os.Getenv("HF_TOKEN")
+	if hfToken == "" {
+		return nil // No token to create
+	}
+
+	// All E2E profiles deploy semantic-router to this namespace
+	nsName := "vllm-semantic-router-system"
+
+	// First, ensure the namespace exists
+	_, err := kubeClient.CoreV1().Namespaces().Get(ctx, nsName, metav1.GetOptions{})
+	if err != nil {
+		// Namespace doesn't exist, create it
+		ns := &corev1.Namespace{
+			ObjectMeta: metav1.ObjectMeta{
+				Name: nsName,
+			},
+		}
+		_, err = kubeClient.CoreV1().Namespaces().Create(ctx, ns, metav1.CreateOptions{})
+		if err != nil && !strings.Contains(err.Error(), "already exists") {
+			// If we can't create the namespace, that's okay - the profile will create it
+			r.log("⚠️  Could not create namespace %s (will be created by profile): %v", nsName, err)
+		}
+	}
+
+	// Create the secret in the namespace where semantic-router is deployed
+	secret := &corev1.Secret{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "hf-token-secret",
+			Namespace: nsName,
+		},
+		Type: corev1.SecretTypeOpaque,
+		StringData: map[string]string{
+			"token": hfToken,
+		},
+	}
+
+	_, err = kubeClient.CoreV1().Secrets(nsName).Create(ctx, secret, metav1.CreateOptions{})
+	if err != nil {
+		// If secret already exists, update it
+		if strings.Contains(err.Error(), "already exists") {
+			_, err = kubeClient.CoreV1().Secrets(nsName).Update(ctx, secret, metav1.UpdateOptions{})
+			if err != nil {
+				return fmt.Errorf("failed to update existing HF_TOKEN secret in %s: %w", nsName, err)
+			}
+			return nil
+		}
+		// If namespace still doesn't exist, that's okay - it will be created by Helm
+		if strings.Contains(err.Error(), "not found") {
+			r.log("⚠️  Namespace %s not found yet (will be created by profile)", nsName)
+			return nil
+		}
+		return fmt.Errorf("failed to create HF_TOKEN secret in %s: %w", nsName, err)
+	}
+
+	return nil
+}
+
 func getPodReadyStatus(pod corev1.Pod) string {
 	readyCount := 0
 	totalCount := len(pod.Status.ContainerStatuses)
diff --git a/e2e/profiles/ai-gateway/values.yaml b/e2e/profiles/ai-gateway/values.yaml
@@ -628,6 +628,7 @@ config:
   # - EmbeddingGemma-300M: Up to 8K context, fast inference, Matryoshka support (768/512/256/128)
   embedding_models:
     qwen3_model_path: "models/Qwen3-Embedding-0.6B"
+    gemma_model_path: "models/embeddinggemma-300m"
     use_cpu: true  # Set to false for GPU acceleration (requires CUDA)
 
   # Observability Configuration
diff --git a/e2e/profiles/dynamic-config/values.yaml b/e2e/profiles/dynamic-config/values.yaml
diff --git a/scripts/quickstart.sh b/scripts/quickstart.sh
diff --git a/tools/make/models.mk b/tools/make/models.mk