diff --git a/.github/workflows/integration-test-docker.yml b/.github/workflows/integration-test-docker.yml index 4e37d79b6..d94fe1050 100644 --- a/.github/workflows/integration-test-docker.yml +++ b/.github/workflows/integration-test-docker.yml @@ -82,13 +82,18 @@ jobs: - name: Download models run: | echo "Downloading minimal models for CI..." + echo "Note: Gated models (e.g., embeddinggemma-300m) will be gracefully skipped if HF_TOKEN is not available." make download-models env: CI: true CI_MINIMAL_MODELS: true - HF_TOKEN: ${{ secrets.HF_TOKEN }} HF_HUB_ENABLE_HF_TRANSFER: 1 HF_HUB_DISABLE_TELEMETRY: 1 + # HF_TOKEN is required for downloading gated models (e.g., embeddinggemma-300m) + # For PRs from forks, this will be empty and the model_manager will gracefully skip gated models + # The hf CLI uses HUGGINGFACE_HUB_TOKEN, so we set both for compatibility + HF_TOKEN: ${{ secrets.HF_TOKEN }} + HUGGINGFACE_HUB_TOKEN: ${{ secrets.HF_TOKEN }} - name: Start CI services run: | diff --git a/.github/workflows/integration-test-dynamic-config.yml b/.github/workflows/integration-test-dynamic-config.yml index a5590de96..d23ea2a5d 100644 --- a/.github/workflows/integration-test-dynamic-config.yml +++ b/.github/workflows/integration-test-dynamic-config.yml @@ -76,6 +76,12 @@ jobs: - name: Run Dynamic Config E2E tests id: e2e-test + env: + # HF_TOKEN is required for downloading gated models (e.g., embeddinggemma-300m) + # For PRs from forks, this will be empty and the model_manager will gracefully skip gated models + # The hf CLI uses HUGGINGFACE_HUB_TOKEN, so we set both for compatibility + HF_TOKEN: ${{ secrets.HF_TOKEN }} + HUGGINGFACE_HUB_TOKEN: ${{ secrets.HF_TOKEN }} run: | set +e # Don't exit on error, we want to capture the result make e2e-test E2E_PROFILE=dynamic-config E2E_VERBOSE=true E2E_KEEP_CLUSTER=false diff --git a/.github/workflows/integration-test-helm.yml b/.github/workflows/integration-test-helm.yml index 1c1892218..2c4521d0d 100644 --- a/.github/workflows/integration-test-helm.yml +++ b/.github/workflows/integration-test-helm.yml @@ -161,11 +161,28 @@ jobs: kubectl get namespace vllm-semantic-router-system echo "::endgroup::" + - name: Create HF_TOKEN secret (if available) + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: | + echo "::group::Create HF_TOKEN Secret" + if [ -n "$HF_TOKEN" ]; then + kubectl create secret generic hf-token-secret \ + --from-literal=token="$HF_TOKEN" \ + -n vllm-semantic-router-system \ + --dry-run=client -o yaml | kubectl apply -f - + echo "✓ Created hf-token-secret (HF_TOKEN is set)" + else + echo "⚠️ HF_TOKEN not set - gated models (e.g., embeddinggemma-300m) will be gracefully skipped" + fi + echo "::endgroup::" + - name: Install Helm chart (CI minimal config) run: | echo "::group::Install Chart" # CI environment: Download only essential model to avoid OOM # Only download all-MiniLM-L12-v2 (smallest model ~120MB) + # Note: Default values include embeddinggemma-300m, which will be skipped if hf-token-secret is not available helm install semantic-router ${{ env.CHART_PATH }} \ --namespace vllm-semantic-router-system \ --wait \ diff --git a/.github/workflows/integration-test-k8s.yml b/.github/workflows/integration-test-k8s.yml index 4aa4a52d6..e77fbdbe1 100644 --- a/.github/workflows/integration-test-k8s.yml +++ b/.github/workflows/integration-test-k8s.yml @@ -81,6 +81,12 @@ jobs: - name: Run Integration E2E tests (${{ matrix.profile }}) id: e2e-test + env: + # HF_TOKEN is required for downloading gated models (e.g., embeddinggemma-300m) + # For PRs from 
forks, this will be empty and the E2E framework will gracefully skip gated model downloads + # The hf CLI uses HUGGINGFACE_HUB_TOKEN, so we set both for compatibility + HF_TOKEN: ${{ secrets.HF_TOKEN }} + HUGGINGFACE_HUB_TOKEN: ${{ secrets.HF_TOKEN }} run: | set +e # Don't exit on error, we want to capture the result make e2e-test E2E_PROFILE=${{ matrix.profile }} E2E_VERBOSE=true E2E_KEEP_CLUSTER=false diff --git a/.github/workflows/performance-nightly.yml b/.github/workflows/performance-nightly.yml index 145deff84..c57a322ee 100644 --- a/.github/workflows/performance-nightly.yml +++ b/.github/workflows/performance-nightly.yml @@ -70,9 +70,13 @@ jobs: - name: Download models (minimal set for nightly) env: CI_MINIMAL_MODELS: false - HF_TOKEN: ${{ secrets.HF_TOKEN }} HF_HUB_ENABLE_HF_TRANSFER: 1 HF_HUB_DISABLE_TELEMETRY: 1 + # HF_TOKEN is required for downloading gated models (e.g., embeddinggemma-300m) + # For PRs from forks, this will be empty and the model_manager will gracefully skip gated models + # The hf CLI uses HUGGINGFACE_HUB_TOKEN, so we set both for compatibility + HF_TOKEN: ${{ secrets.HF_TOKEN }} + HUGGINGFACE_HUB_TOKEN: ${{ secrets.HF_TOKEN }} run: make download-models - name: Create reports directory diff --git a/.github/workflows/performance-test.yml b/.github/workflows/performance-test.yml index c47b0ab2c..5f24f44a8 100644 --- a/.github/workflows/performance-test.yml +++ b/.github/workflows/performance-test.yml @@ -79,9 +79,13 @@ jobs: - name: Download models (minimal) env: CI_MINIMAL_MODELS: true - HF_TOKEN: ${{ secrets.HF_TOKEN }} HF_HUB_ENABLE_HF_TRANSFER: 1 HF_HUB_DISABLE_TELEMETRY: 1 + # HF_TOKEN is required for downloading gated models (e.g., embeddinggemma-300m) + # For PRs from forks, this will be empty and the model_manager will gracefully skip gated models + # The hf CLI uses HUGGINGFACE_HUB_TOKEN, so we set both for compatibility + HF_TOKEN: ${{ secrets.HF_TOKEN }} + HUGGINGFACE_HUB_TOKEN: ${{ secrets.HF_TOKEN }} run: make download-models - name: Run component benchmarks diff --git a/.github/workflows/test-and-build.yml b/.github/workflows/test-and-build.yml index fd74ab593..d7b790828 100644 --- a/.github/workflows/test-and-build.yml +++ b/.github/workflows/test-and-build.yml @@ -138,12 +138,20 @@ jobs: pip install -r src/model_manager/requirements.txt - name: Download models (minimal on PRs) + run: | + echo "Downloading models for CI..." + echo "Note: Gated models (e.g., embeddinggemma-300m) will be gracefully skipped if HF_TOKEN is not available." + echo "This is expected for PRs from forks where secrets are not exposed." 
+ make download-models env: CI_MINIMAL_MODELS: ${{ github.event_name == 'pull_request' }} - HF_TOKEN: ${{ secrets.HF_TOKEN }} HF_HUB_ENABLE_HF_TRANSFER: 1 HF_HUB_DISABLE_TELEMETRY: 1 - run: make download-models + # HF_TOKEN is required for downloading gated models (e.g., embeddinggemma-300m) + # For PRs from forks, this will be empty and the model_manager will gracefully skip gated models + # The hf CLI uses HUGGINGFACE_HUB_TOKEN, so we set both for compatibility + HF_TOKEN: ${{ secrets.HF_TOKEN }} + HUGGINGFACE_HUB_TOKEN: ${{ secrets.HF_TOKEN }} - name: Start Milvus service run: | diff --git a/candle-binding/semantic-router_test.go b/candle-binding/semantic-router_test.go index 9d7a4427c..ee3f06766 100644 --- a/candle-binding/semantic-router_test.go +++ b/candle-binding/semantic-router_test.go @@ -1482,7 +1482,7 @@ func TestGetEmbeddingSmart(t *testing.T) { } t.Run("ShortTextHighLatency", func(t *testing.T) { - // Short text with high latency priority - uses Qwen3 (1024) since Gemma is not available + // Short text with high latency priority should use Gemma (768) text := "Hello world" embedding, err := GetEmbeddingSmart(text, 0.3, 0.8) @@ -1490,16 +1490,15 @@ func TestGetEmbeddingSmart(t *testing.T) { t.Fatalf("GetEmbeddingSmart failed: %v", err) } - // Expect Qwen3 (1024) dimension since Gemma is not available - if len(embedding) != 1024 { - t.Errorf("Expected 1024-dim embedding, got %d", len(embedding)) + if len(embedding) != 768 { + t.Errorf("Expected 768-dim embedding, got %d", len(embedding)) } t.Logf("Short text embedding generated: dim=%d", len(embedding)) }) t.Run("MediumTextBalanced", func(t *testing.T) { - // Medium text with balanced priorities - uses Qwen3 (1024) since Gemma is not available + // Medium text with balanced priorities - may select Qwen3 (1024) or Gemma (768) text := strings.Repeat("This is a medium length text with enough words to exceed 512 tokens. 
", 10) embedding, err := GetEmbeddingSmart(text, 0.5, 0.5) @@ -1507,9 +1506,9 @@ func TestGetEmbeddingSmart(t *testing.T) { t.Fatalf("GetEmbeddingSmart failed: %v", err) } - // Expect Qwen3 (1024) dimension since Gemma is not available - if len(embedding) != 1024 { - t.Errorf("Expected 1024-dim embedding, got %d", len(embedding)) + // Accept both Qwen3 (1024) and Gemma (768) dimensions + if len(embedding) != 768 && len(embedding) != 1024 { + t.Errorf("Expected 768 or 1024-dim embedding, got %d", len(embedding)) } t.Logf("Medium text embedding generated: dim=%d", len(embedding)) @@ -1569,9 +1568,9 @@ func TestGetEmbeddingSmart(t *testing.T) { return } - // Expect Qwen3 (1024) since Gemma is not available - if len(embedding) != 1024 { - t.Errorf("Expected 1024-dim embedding, got %d", len(embedding)) + // Smart routing may select Qwen3 (1024) or Gemma (768) based on priorities + if len(embedding) != 768 && len(embedding) != 1024 { + t.Errorf("Expected 768 or 1024-dim embedding, got %d", len(embedding)) } t.Logf("Priority test %s: generated %d-dim embedding", tc.desc, len(embedding)) }) @@ -1594,9 +1593,9 @@ func TestGetEmbeddingSmart(t *testing.T) { continue } - // Expect Qwen3 (1024) since Gemma is not available - if len(embedding) != 1024 { - t.Errorf("Iteration %d: Expected 1024-dim embedding, got %d", i, len(embedding)) + // Smart routing may select Qwen3 (1024) or Gemma (768) + if len(embedding) != 768 && len(embedding) != 1024 { + t.Errorf("Iteration %d: Expected 768 or 1024-dim embedding, got %d", i, len(embedding)) } // Verify no nil pointers @@ -1635,12 +1634,11 @@ func BenchmarkGetEmbeddingSmart(b *testing.B) { } // Test constants for embedding models (Phase 4.2) -// Note: Gemma model is gated and requires HF_TOKEN, so tests use Qwen3 only const ( Qwen3EmbeddingModelPath = "../models/Qwen3-Embedding-0.6B" - GemmaEmbeddingModelPath = "" // Gemma is gated, not used in CI tests + GemmaEmbeddingModelPath = "../models/embeddinggemma-300m" TestEmbeddingText = "This is a test sentence for embedding generation" - TestLongContextText = "This is a longer text that might benefit from long-context embedding models like Qwen3" + TestLongContextText = "This is a longer text that might benefit from long-context embedding models like Qwen3 or Gemma" ) // Test constants for Qwen3 Multi-LoRA @@ -1702,8 +1700,22 @@ func TestInitEmbeddingModels(t *testing.T) { }) t.Run("InitGemmaOnly", func(t *testing.T) { - // Gemma is a gated model requiring HF_TOKEN, skip in CI - t.Skip("Skipping Gemma-only test: Gemma is a gated model requiring HF_TOKEN") + err := InitEmbeddingModels("", GemmaEmbeddingModelPath, true) + if err != nil { + t.Logf("InitEmbeddingModels (Gemma only) returned error (may already be initialized): %v", err) + + // Verify functionality + _, testErr := GetEmbeddingSmart("test", 0.5, 0.5) + if testErr == nil { + t.Log("✓ ModelFactory is functional (already initialized)") + } else { + if isModelInitializationError(testErr) { + t.Skipf("Skipping test due to model unavailability: %v", testErr) + } + } + } else { + t.Log("✓ Gemma model initialized successfully") + } }) t.Run("InitWithInvalidPaths", func(t *testing.T) { @@ -1785,16 +1797,16 @@ func TestGetEmbeddingWithDim(t *testing.T) { t.Run("OversizedDimension", func(t *testing.T) { // Test graceful degradation when requested dimension exceeds model capacity - // Qwen3: 1024, so 2048 should fall back to full dimension + // Qwen3: 1024, Gemma: 768, so 2048 should fall back to full dimension embedding, err := GetEmbeddingWithDim(TestEmbeddingText, 
0.5, 0.5, 2048) if err != nil { t.Errorf("Should gracefully handle oversized dimension, got error: %v", err) return } - // Should return full dimension (1024 for Qwen3) - if len(embedding) != 1024 { - t.Errorf("Expected full dimension (1024), got %d", len(embedding)) + // Should return full dimension (1024 for Qwen3 or 768 for Gemma) + if len(embedding) != 1024 && len(embedding) != 768 { + t.Errorf("Expected full dimension (1024 or 768), got %d", len(embedding)) } else { t.Logf("✓ Oversized dimension gracefully degraded to full dimension: %d", len(embedding)) } @@ -1889,9 +1901,6 @@ func TestEmbeddingPriorityRouting(t *testing.T) { if err != nil { t.Fatalf("Failed to initialize embedding models: %v", err) } - - // Note: These tests use Matryoshka dimension truncation (768) with Qwen3 model - // The dimension is truncated from Qwen3's full 1024 dimensions testCases := []struct { name string text string @@ -1906,7 +1915,7 @@ func TestEmbeddingPriorityRouting(t *testing.T) { qualityPriority: 0.2, latencyPriority: 0.9, expectedDim: 768, - description: "Uses Qwen3 with Matryoshka 768 truncation", + description: "Should prefer faster embedding model (Gemma > Qwen3)", }, { name: "HighQualityPriority", @@ -1914,7 +1923,7 @@ func TestEmbeddingPriorityRouting(t *testing.T) { qualityPriority: 0.9, latencyPriority: 0.2, expectedDim: 768, - description: "Uses Qwen3 with Matryoshka 768 truncation", + description: "Should prefer quality model (Qwen3/Gemma)", }, { name: "BalancedPriority", @@ -1922,7 +1931,7 @@ func TestEmbeddingPriorityRouting(t *testing.T) { qualityPriority: 0.5, latencyPriority: 0.5, expectedDim: 768, - description: "Uses Qwen3 with Matryoshka 768 truncation", + description: "Should select based on text length", }, } diff --git a/config/model_manager/models.lora.yaml b/config/model_manager/models.lora.yaml index 22375f144..cda113cbf 100644 --- a/config/model_manager/models.lora.yaml +++ b/config/model_manager/models.lora.yaml @@ -29,3 +29,7 @@ models: - id: Qwen3-Embedding-0.6B repo_id: Qwen/Qwen3-Embedding-0.6B + + # Gated model - requires HF_TOKEN (will gracefully skip if token not available) + - id: embeddinggemma-300m + repo_id: google/embeddinggemma-300m diff --git a/config/model_manager/models.minimal.yaml b/config/model_manager/models.minimal.yaml index cff4b5d06..4f584c413 100644 --- a/config/model_manager/models.minimal.yaml +++ b/config/model_manager/models.minimal.yaml @@ -10,8 +10,8 @@ # Equivalent to: make download-models-minimal # or CI_MINIMAL_MODELS=true make download-models # -# Note: This is the minimal set for fast CI runs. Larger models like -# embeddinggemma-300m are in models.yaml (full set) for local development. +# Note: This is the minimal set for fast CI runs. Gated models like +# embeddinggemma-300m will gracefully skip if HF_TOKEN is not available. 
cache_dir: "models" verify: "size" # Use size for faster CI runs @@ -56,6 +56,10 @@ models: - id: Qwen3-Embedding-0.6B repo_id: Qwen/Qwen3-Embedding-0.6B + # Gated model - requires HF_TOKEN (will gracefully skip if token not available) + - id: embeddinggemma-300m + repo_id: google/embeddinggemma-300m + # ============================================================================= # Hallucination Detection - Required for hallucination tests # ============================================================================= diff --git a/deploy/helm/semantic-router/templates/deployment.yaml b/deploy/helm/semantic-router/templates/deployment.yaml index 761a88615..b095d583e 100644 --- a/deploy/helm/semantic-router/templates/deployment.yaml +++ b/deploy/helm/semantic-router/templates/deployment.yaml @@ -42,13 +42,33 @@ spec: command: ["/bin/bash", "-c"] args: - | - set -e echo "Downloading models to persistent volume..." cd /app/models {{- range .Values.initContainer.models }} # Download {{ .name }} echo "Downloading {{ .name }} from {{ .repo }}..." + + {{- if or (eq .name "embeddinggemma-300m") (contains "embeddinggemma" .name) }} + # Skip gated models if token is missing + if [ -z "${HF_TOKEN:-}" ] && [ -z "${HUGGINGFACE_HUB_TOKEN:-}" ]; then + echo "⚠️ Skipping {{ .name }} (HF_TOKEN not set, gated model requires authentication)" + else + # Remove .cache directory to ensure fresh download + rm -rf "{{ .name }}/.cache" 2>/dev/null || true + # Download with ignore_patterns to exclude ONNX-only files if pytorch model exists + python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='{{ .repo }}', local_dir='{{ .name }}', ignore_patterns=['*.onnx', '*.msgpack', '*.h5', '*.tflite'] if '{{ .name }}' == 'all-MiniLM-L12-v2' else None)" + + # Check for required model files + echo "Checking {{ .name }} for required files:" + if [ -f "{{ .name }}/pytorch_model.bin" ] || [ -f "{{ .name }}/model.safetensors" ]; then + echo "✓ Found PyTorch model weights in {{ .name }}" + else + echo "✗ WARNING: No PyTorch model weights found in {{ .name }}" + ls -la "{{ .name }}/" | head -20 + fi + fi # End of HF_TOKEN check for Gemma + {{- else }} # Remove .cache directory to ensure fresh download rm -rf "{{ .name }}/.cache" 2>/dev/null || true # Download with ignore_patterns to exclude ONNX-only files if pytorch model exists @@ -62,6 +82,7 @@ spec: echo "✗ WARNING: No PyTorch model weights found in {{ .name }}" ls -la "{{ .name }}/" | head -20 fi + {{- end }} {{- end }} echo "All models downloaded successfully!" @@ -69,9 +90,9 @@ spec: env: - name: HF_HUB_CACHE value: /tmp/hf_cache - {{- with .Values.initContainer.env }} - {{- toYaml . | nindent 10 }} - {{- end }} + {{- with .Values.initContainer.env }} + {{- toYaml . | nindent 8 }} + {{- end }} resources: {{- toYaml .Values.initContainer.resources | nindent 10 }} volumeMounts: diff --git a/deploy/helm/semantic-router/values.yaml b/deploy/helm/semantic-router/values.yaml index 736868c37..0fffb0bb5 100644 --- a/deploy/helm/semantic-router/values.yaml +++ b/deploy/helm/semantic-router/values.yaml @@ -149,20 +149,28 @@ initContainer: # -- Additional environment variables for the init container. # For example, to use a private Hugging Face model, you can pass a token # and specify an endpoint using a pre-existing Kubernetes secret. 
- # env: - # - name: HF_TOKEN - # valueFrom: - # secretKeyRef: - # name: my-hf-secret - # key: token - # - name: HF_ENDPOINT - # value: "https://huggingface.co" - env: [] + # HF_TOKEN is required for downloading gated models like embeddinggemma-300m + # For PRs from forks, this will be empty and gated models will be gracefully skipped + env: + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + optional: true # Allow deployment even if secret doesn't exist (for local testing) + - name: HUGGINGFACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + optional: true # Allow deployment even if secret doesn't exist (for local testing) # -- Models to download models: # Embedding models for semantic cache and tools - name: Qwen3-Embedding-0.6B repo: Qwen/Qwen3-Embedding-0.6B + - name: embeddinggemma-300m + repo: google/embeddinggemma-300m - name: all-MiniLM-L12-v2 repo: sentence-transformers/all-MiniLM-L12-v2 - name: lora_intent_classifier_bert-base-uncased_model diff --git a/e2e/pkg/framework/runner.go b/e2e/pkg/framework/runner.go index 8c6b80861..aa0de8278 100644 --- a/e2e/pkg/framework/runner.go +++ b/e2e/pkg/framework/runner.go @@ -114,6 +114,19 @@ func (r *Runner) Run(ctx context.Context) error { // Set Kubernetes client for report generator r.reporter.SetKubeClient(kubeClient) + // Step 3.5: Create HF_TOKEN secret if available (for gated model downloads) + // This is required for downloading gated models like google/embeddinggemma-300m + if hfToken := os.Getenv("HF_TOKEN"); hfToken != "" { + if err := r.createHFTokenSecret(ctx, kubeClient); err != nil { + r.log("⚠️ Warning: Failed to create HF_TOKEN secret: %v", err) + r.log(" Model downloads may fail if gated models (e.g., embeddinggemma-300m) are required") + } else { + r.log("✅ Created HF_TOKEN secret for gated model downloads") + } + } else { + r.log("ℹ️ HF_TOKEN not set - gated models (e.g., embeddinggemma-300m) may not be downloadable") + } + // Step 4: Setup profile (deploy Helm charts, etc.) 
if !r.opts.SkipSetup { setupOpts := &SetupOptions{ @@ -492,6 +505,68 @@ func (r *Runner) collectSemanticRouterLogs(ctx context.Context, client *kubernet return nil } +// createHFTokenSecret creates a Kubernetes secret for HF_TOKEN if it's available in the environment +// This is required for the init container to download gated models like google/embeddinggemma-300m +// The secret must be in the same namespace as the semantic-router deployment (vllm-semantic-router-system) +// because Kubernetes secrets are namespace-scoped +func (r *Runner) createHFTokenSecret(ctx context.Context, kubeClient *kubernetes.Clientset) error { + hfToken := os.Getenv("HF_TOKEN") + if hfToken == "" { + return nil // No token to create + } + + // All E2E profiles deploy semantic-router to this namespace + nsName := "vllm-semantic-router-system" + + // First, ensure the namespace exists + _, err := kubeClient.CoreV1().Namespaces().Get(ctx, nsName, metav1.GetOptions{}) + if err != nil { + // Namespace doesn't exist, create it + ns := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: nsName, + }, + } + _, err = kubeClient.CoreV1().Namespaces().Create(ctx, ns, metav1.CreateOptions{}) + if err != nil && !strings.Contains(err.Error(), "already exists") { + // If we can't create the namespace, that's okay - the profile will create it + r.log("⚠️ Could not create namespace %s (will be created by profile): %v", nsName, err) + } + } + + // Create the secret in the namespace where semantic-router is deployed + secret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "hf-token-secret", + Namespace: nsName, + }, + Type: corev1.SecretTypeOpaque, + StringData: map[string]string{ + "token": hfToken, + }, + } + + _, err = kubeClient.CoreV1().Secrets(nsName).Create(ctx, secret, metav1.CreateOptions{}) + if err != nil { + // If secret already exists, update it + if strings.Contains(err.Error(), "already exists") { + _, err = kubeClient.CoreV1().Secrets(nsName).Update(ctx, secret, metav1.UpdateOptions{}) + if err != nil { + return fmt.Errorf("failed to update existing HF_TOKEN secret in %s: %w", nsName, err) + } + return nil + } + // If namespace still doesn't exist, that's okay - it will be created by Helm + if strings.Contains(err.Error(), "not found") { + r.log("⚠️ Namespace %s not found yet (will be created by profile)", nsName) + return nil + } + return fmt.Errorf("failed to create HF_TOKEN secret in %s: %w", nsName, err) + } + + return nil +} + func getPodReadyStatus(pod corev1.Pod) string { readyCount := 0 totalCount := len(pod.Status.ContainerStatuses) diff --git a/e2e/profiles/ai-gateway/values.yaml b/e2e/profiles/ai-gateway/values.yaml index f369f77bd..d9d3b92fa 100644 --- a/e2e/profiles/ai-gateway/values.yaml +++ b/e2e/profiles/ai-gateway/values.yaml @@ -626,6 +626,7 @@ config: # - EmbeddingGemma-300M: Up to 8K context, fast inference, Matryoshka support (768/512/256/128) embedding_models: qwen3_model_path: "models/Qwen3-Embedding-0.6B" + gemma_model_path: "models/embeddinggemma-300m" use_cpu: true # Set to false for GPU acceleration (requires CUDA) # Observability Configuration diff --git a/e2e/profiles/dynamic-config/profile.go b/e2e/profiles/dynamic-config/profile.go index fc397de01..3c7b84957 100644 --- a/e2e/profiles/dynamic-config/profile.go +++ b/e2e/profiles/dynamic-config/profile.go @@ -256,16 +256,37 @@ func (p *Profile) kubectlApply(ctx context.Context, kubeconfig, manifestPath str func (p *Profile) verifyCRDsExist(ctx context.Context, kubeconfig string) error { // Verify IntelligentPool 
exists - cmd := exec.CommandContext(ctx, "kubectl", "get", "intelligentpool", "ai-gateway-pool", "-n", "default", "--kubeconfig", kubeconfig) - if err := cmd.Run(); err != nil { + cmd := exec.CommandContext(ctx, "kubectl", "get", "intelligentpool", "ai-gateway-pool", "-n", "default", "--kubeconfig", kubeconfig, "-o", "yaml") + output, err := cmd.Output() + if err != nil { return fmt.Errorf("IntelligentPool 'ai-gateway-pool' not found: %w", err) } + if p.verbose { + p.log("IntelligentPool 'ai-gateway-pool' found:\n%s", string(output)) + } // Verify IntelligentRoute exists - cmd = exec.CommandContext(ctx, "kubectl", "get", "intelligentroute", "ai-gateway-route", "-n", "default", "--kubeconfig", kubeconfig) - if err := cmd.Run(); err != nil { + cmd = exec.CommandContext(ctx, "kubectl", "get", "intelligentroute", "ai-gateway-route", "-n", "default", "--kubeconfig", kubeconfig, "-o", "yaml") + output, err = cmd.Output() + if err != nil { return fmt.Errorf("IntelligentRoute 'ai-gateway-route' not found: %w", err) } + if p.verbose { + p.log("IntelligentRoute 'ai-gateway-route' found:\n%s", string(output)) + } + + // Check CRD status conditions + cmd = exec.CommandContext(ctx, "kubectl", "get", "intelligentpool", "ai-gateway-pool", "-n", "default", "--kubeconfig", kubeconfig, "-o", "jsonpath={.status.conditions}") + statusOutput, _ := cmd.Output() + if p.verbose { + p.log("IntelligentPool status conditions: %s", string(statusOutput)) + } + + cmd = exec.CommandContext(ctx, "kubectl", "get", "intelligentroute", "ai-gateway-route", "-n", "default", "--kubeconfig", kubeconfig, "-o", "jsonpath={.status.conditions}") + statusOutput, _ = cmd.Output() + if p.verbose { + p.log("IntelligentRoute status conditions: %s", string(statusOutput)) + } return nil } @@ -359,6 +380,23 @@ func (p *Profile) verifyEnvironment(ctx context.Context, opts *framework.SetupOp p.log("All deployments are healthy") + // Additional diagnostic: Check semantic-router pod logs for CRD processing + if p.verbose { + p.log("Checking semantic-router pod logs for CRD processing...") + cmd := exec.CommandContext(ctx, "kubectl", "logs", "-n", "vllm-semantic-router-system", "-l", "app.kubernetes.io/name=semantic-router", "--tail=50", "--kubeconfig", opts.KubeConfig) + logs, err := cmd.Output() + if err == nil { + p.log("Recent semantic-router logs:\n%s", string(logs)) + } else { + p.log("Could not retrieve semantic-router logs: %v", err) + } + + // Check if semantic-router is configured with Kubernetes config source + cmd = exec.CommandContext(ctx, "kubectl", "get", "deployment", "semantic-router", "-n", "vllm-semantic-router-system", "-o", "jsonpath={.spec.template.spec.containers[0].env}", "--kubeconfig", opts.KubeConfig) + envOutput, _ := cmd.Output() + p.log("Semantic-router container environment variables: %s", string(envOutput)) + } + return nil } diff --git a/e2e/profiles/dynamic-config/values.yaml b/e2e/profiles/dynamic-config/values.yaml index bf77f4206..ef3e13ecd 100644 --- a/e2e/profiles/dynamic-config/values.yaml +++ b/e2e/profiles/dynamic-config/values.yaml @@ -5,7 +5,7 @@ # Environment variables for the semantic-router container env: - name: EMBEDDING_MODEL_OVERRIDE - value: "qwen3" # Force qwen3 for tests (Gemma requires HF_TOKEN) + value: "auto" # Use intelligent model selection config: # Set config source to kubernetes to enable CRD-based configuration @@ -127,7 +127,7 @@ config: embedding_models: qwen3_model_path: "models/Qwen3-Embedding-0.6B" - gemma_model_path: "" # Empty = fallback to Qwen3 (embeddinggemma requires 
HF_TOKEN) + gemma_model_path: "models/embeddinggemma-300m" use_cpu: true # Increase memory limits for embedding model support diff --git a/scripts/quickstart.sh b/scripts/quickstart.sh index 78dbf169d..82260fbe8 100755 --- a/scripts/quickstart.sh +++ b/scripts/quickstart.sh @@ -167,13 +167,17 @@ download_models() { info_msg "📥 Downloading AI models..." echo - # Try full model set first (includes embeddinggemma-300m which may require auth) + # Try full model set first (includes embeddinggemma-300m which requires HF_TOKEN for gated access) # If that fails (e.g., 401 on gated models), fall back to minimal set if [ "${CI_MINIMAL_MODELS:-}" = "true" ]; then info_msg "CI_MINIMAL_MODELS=true detected, using minimal model set" export CI_MINIMAL_MODELS=true else info_msg "Attempting to download full model set (includes embeddinggemma-300m)..." + if [ -z "${HF_TOKEN:-}" ]; then + info_msg "ℹ️ Note: HF_TOKEN not set. If embeddinggemma-300m download fails, script will fall back to minimal model set." + info_msg " To download Gemma, set HF_TOKEN environment variable: export HF_TOKEN=your_token" + fi export CI_MINIMAL_MODELS=false fi @@ -183,11 +187,13 @@ download_models() { else # Check if failure was due to gated model (embeddinggemma-300m) if grep -q "embeddinggemma.*401\|embeddinggemma.*Unauthorized\|embeddinggemma.*GatedRepoError" /tmp/download-models-output.log 2>/dev/null; then - info_msg "⚠️ Full model download failed (gated model requires auth)" - info_msg "📋 Falling back to minimal model set..." + info_msg "⚠️ Full model download failed: embeddinggemma-300m requires HF_TOKEN for gated model access" + info_msg "📋 Falling back to minimal model set (without Gemma)..." + info_msg "💡 To download Gemma, set HF_TOKEN: export HF_TOKEN=your_token && make download-models" export CI_MINIMAL_MODELS=true if make download-models 2>&1 | tee /tmp/download-models-output.log; then success_msg "✅ Minimal models downloaded successfully!" + info_msg "ℹ️ Note: Gemma embedding model was skipped. Some features may be limited." else error_msg "❌ Failed to download even minimal models!" 
info_msg "📋 Check logs: cat /tmp/download-models-output.log" diff --git a/src/model_manager/__init__.py b/src/model_manager/__init__.py index 5da0bffd6..7018456c7 100644 --- a/src/model_manager/__init__.py +++ b/src/model_manager/__init__.py @@ -24,6 +24,7 @@ MissingModelError, BadChecksumError, DownloadError, + GatedModelError, ) __version__ = "0.1.0" @@ -41,6 +42,7 @@ "MissingModelError", "BadChecksumError", "DownloadError", + "GatedModelError", ] @@ -96,15 +98,25 @@ def ensure_all(self) -> dict[str, str]: continue logger.info(f"Downloading model '{spec.id}' from {spec.repo_id}...") - local_path = download_model(spec, self.config.cache_dir) - - if self.config.verify != "none": - logger.info(f"Verifying model '{spec.id}'...") - if not verify_model(local_path, self.config.verify): - raise BadChecksumError(f"Verification failed for model '{spec.id}'") - - results[spec.id] = local_path - logger.info(f"Model '{spec.id}' ready at {local_path}") + try: + local_path = download_model(spec, self.config.cache_dir) + + if self.config.verify != "none": + logger.info(f"Verifying model '{spec.id}'...") + if not verify_model(local_path, self.config.verify): + raise BadChecksumError( + f"Verification failed for model '{spec.id}'" + ) + + results[spec.id] = local_path + logger.info(f"Model '{spec.id}' ready at {local_path}") + except GatedModelError as e: + # Gracefully skip gated models when token is not available + logger.warning( + f"⚠️ Skipping gated model '{spec.id}': {e}. " + "This is expected for PRs from forks where HF_TOKEN is not available." + ) + continue return results @@ -117,7 +129,14 @@ def ensure_model(self, model_id: str) -> str: Returns: Local path to the model + + Raises: + GatedModelError: If the model is gated and HF_TOKEN is not available """ + import logging + + logger = logging.getLogger(__name__) + spec = self.get_model_spec(model_id) if spec is None: raise MissingModelError(f"Model '{model_id}' not found in configuration") diff --git a/src/model_manager/cli.py b/src/model_manager/cli.py index e79274e89..352e3deaa 100644 --- a/src/model_manager/cli.py +++ b/src/model_manager/cli.py @@ -189,6 +189,8 @@ def main() -> int: # Ensure all models results = manager.ensure_all() logger.info(f"All {len(results)} models ready") + # Note: Some gated models may have been skipped if HF_TOKEN is not available + # This is expected behavior and not an error return 0 except ModelManagerError as e: diff --git a/src/model_manager/downloader.py b/src/model_manager/downloader.py index 5defb3492..d3643cb26 100644 --- a/src/model_manager/downloader.py +++ b/src/model_manager/downloader.py @@ -11,8 +11,14 @@ from huggingface_hub import snapshot_download from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError +try: + from huggingface_hub.errors import GatedRepoError +except ImportError: + # Fallback for older versions of huggingface_hub + GatedRepoError = None + from .config import ModelSpec -from .errors import DownloadError, MissingModelError +from .errors import DownloadError, MissingModelError, GatedModelError logger = logging.getLogger(__name__) @@ -75,17 +81,56 @@ def download_model(spec: ModelSpec, cache_dir: str) -> str: logger.info(f"Successfully downloaded model '{spec.id}' to '{result_path}'") return result_path - except RepositoryNotFoundError: + except RepositoryNotFoundError as e: + # RepositoryNotFoundError (404) can occur for gated models when not authenticated + # HuggingFace returns 404 instead of 401 to avoid revealing repository existence + # Check if this might 
be a gated model by examining the error message + error_str = str(e).lower() + # Known gated models that might return 404 when not authenticated + known_gated_models = ["embeddinggemma", "gemma"] + is_known_gated = any( + gated_name in spec.repo_id.lower() for gated_name in known_gated_models + ) + + # If it's a known gated model or the error suggests authentication issues, treat as gated + if ( + is_known_gated + or "401" in error_str + or "unauthorized" in error_str + or "gated" in error_str + ): + raise GatedModelError( + f"Gated model '{spec.id}' requires HF_TOKEN authentication. " + f"Set HF_TOKEN or HUGGINGFACE_HUB_TOKEN environment variable to download. " + f"Note: Gated models may return 'Repository not found' (404) when not authenticated." + ) from e + raise MissingModelError( f"Repository not found: '{spec.repo_id}'. " "Check if the repository exists and you have access." - ) + ) from e except RevisionNotFoundError: raise MissingModelError( f"Revision not found: '{spec.revision}' in repository '{spec.repo_id}'. " "Check if the revision (commit/tag/branch) exists." ) except Exception as e: + # Check if this is a gated model error (401 Unauthorized or GatedRepoError) + error_str = str(e).lower() + is_gated_error = ( + (GatedRepoError is not None and isinstance(e, GatedRepoError)) + or "401" in error_str + or "unauthorized" in error_str + or "gated" in error_str + or "gatedrepoerror" in error_str + ) + + if is_gated_error: + raise GatedModelError( + f"Gated model '{spec.id}' requires HF_TOKEN authentication. " + f"Set HF_TOKEN or HUGGINGFACE_HUB_TOKEN environment variable to download." + ) from e + raise DownloadError( f"Failed to download model '{spec.id}' from '{spec.repo_id}': {e}" ) from e diff --git a/src/model_manager/errors.py b/src/model_manager/errors.py index 25137676d..20e78bee9 100644 --- a/src/model_manager/errors.py +++ b/src/model_manager/errors.py @@ -31,3 +31,9 @@ class ConfigurationError(ModelManagerError): """Raised when configuration is invalid or missing.""" pass + + +class GatedModelError(ModelManagerError): + """Raised when attempting to download a gated model without authentication.""" + + pass diff --git a/src/semantic-router/pkg/extproc/processor_req_body.go b/src/semantic-router/pkg/extproc/processor_req_body.go index f2b546f3b..73d210f5f 100644 --- a/src/semantic-router/pkg/extproc/processor_req_body.go +++ b/src/semantic-router/pkg/extproc/processor_req_body.go @@ -373,6 +373,13 @@ func (r *OpenAIRouter) createRoutingResponse(model string, endpoint string, modi RawValue: []byte(model), }, }) + // Add x-ai-eg-model header for Envoy AI Gateway compatibility + setHeaders = append(setHeaders, &core.HeaderValueOption{ + Header: &core.HeaderValue{ + Key: "x-ai-eg-model", // Envoy AI Gateway expects this header + RawValue: []byte(model), + }, + }) } // For Response API requests, modify :path to /v1/chat/completions @@ -441,6 +448,13 @@ func (r *OpenAIRouter) createSpecifiedModelResponse(model string, endpoint strin RawValue: []byte(model), }, }) + // Add x-ai-eg-model header for Envoy AI Gateway compatibility + setHeaders = append(setHeaders, &core.HeaderValueOption{ + Header: &core.HeaderValue{ + Key: "x-ai-eg-model", // Envoy AI Gateway expects this header + RawValue: []byte(model), + }, + }) // For Response API requests, modify :path to /v1/chat/completions and use translated body var bodyMutation *ext_proc.BodyMutation
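
Local verification note (not part of the patch): the Helm values above now reference an optional `hf-token-secret`, and the `integration-test-helm` workflow creates it only when `HF_TOKEN` is set. A minimal sketch of reproducing that setup by hand, assuming the chart directory `deploy/helm/semantic-router` and the `vllm-semantic-router-system` namespace used by the workflows:

    # Sketch only: mirrors the "Create HF_TOKEN secret (if available)" workflow step.
    # <your-hf-token> is a placeholder; the secret is marked optional in values.yaml,
    # so omitting it simply means gated models such as google/embeddinggemma-300m
    # are skipped by the init container instead of failing the deployment.
    export HF_TOKEN=<your-hf-token>

    kubectl create namespace vllm-semantic-router-system \
      --dry-run=client -o yaml | kubectl apply -f -

    kubectl create secret generic hf-token-secret \
      --from-literal=token="$HF_TOKEN" \
      -n vllm-semantic-router-system \
      --dry-run=client -o yaml | kubectl apply -f -

    helm install semantic-router deploy/helm/semantic-router \
      --namespace vllm-semantic-router-system \
      --wait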