7 changes: 6 additions & 1 deletion .github/workflows/integration-test-docker.yml
@@ -82,13 +82,18 @@ jobs:
- name: Download models
run: |
echo "Downloading minimal models for CI..."
echo "Note: Gated models (e.g., embeddinggemma-300m) will be gracefully skipped if HF_TOKEN is not available."
make download-models
env:
CI: true
CI_MINIMAL_MODELS: true
HF_TOKEN: ${{ secrets.HF_TOKEN }}
HF_HUB_ENABLE_HF_TRANSFER: 1
HF_HUB_DISABLE_TELEMETRY: 1
# HF_TOKEN is required for downloading gated models (e.g., embeddinggemma-300m)
# For PRs from forks, this will be empty and the model_manager will gracefully skip gated models
# The hf CLI uses HUGGINGFACE_HUB_TOKEN, so we set both for compatibility
HF_TOKEN: ${{ secrets.HF_TOKEN }}
HUGGINGFACE_HUB_TOKEN: ${{ secrets.HF_TOKEN }}

- name: Start CI services
run: |
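The "gracefully skip" behavior these comments reference lives in `src/model_manager`, which this diff does not touch. A minimal sketch of that logic, assuming the manager uses `huggingface_hub`; the `GATED_MODELS` set and `download_model` helper are hypothetical names, not the project's actual API:

```python
# Hypothetical sketch: the real src/model_manager code is not shown in this PR.
import os

from huggingface_hub import snapshot_download
from huggingface_hub.errors import GatedRepoError

# Assumption: gated repos are tracked explicitly by repo_id.
GATED_MODELS = {"google/embeddinggemma-300m"}

def download_model(repo_id: str, local_dir: str) -> bool:
    """Download a model; return False instead of raising when a gated model
    cannot be fetched (e.g., PRs from forks, where secrets are not exposed)."""
    token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN")
    if repo_id in GATED_MODELS and not token:
        print(f"Skipping gated model {repo_id}: HF_TOKEN not set")
        return False
    try:
        snapshot_download(repo_id=repo_id, local_dir=local_dir, token=token)
        return True
    except GatedRepoError:
        # A token was provided but does not grant access to the gated repo.
        print(f"Skipping gated model {repo_id}: token lacks access")
        return False
```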
6 changes: 6 additions & 0 deletions .github/workflows/integration-test-dynamic-config.yml
@@ -76,6 +76,12 @@ jobs:

- name: Run Dynamic Config E2E tests
id: e2e-test
env:
# HF_TOKEN is required for downloading gated models (e.g., embeddinggemma-300m)
# For PRs from forks, this will be empty and the model_manager will gracefully skip gated models
# The hf CLI uses HUGGINGFACE_HUB_TOKEN, so we set both for compatibility
HF_TOKEN: ${{ secrets.HF_TOKEN }}
HUGGINGFACE_HUB_TOKEN: ${{ secrets.HF_TOKEN }}
run: |
set +e # Don't exit on error; we want to capture the result
make e2e-test E2E_PROFILE=dynamic-config E2E_VERBOSE=true E2E_KEEP_CLUSTER=false
17 changes: 17 additions & 0 deletions .github/workflows/integration-test-helm.yml
@@ -161,11 +161,28 @@ jobs:
kubectl get namespace vllm-semantic-router-system
echo "::endgroup::"

- name: Create HF_TOKEN secret (if available)
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
run: |
echo "::group::Create HF_TOKEN Secret"
if [ -n "$HF_TOKEN" ]; then
kubectl create secret generic hf-token-secret \
--from-literal=token="$HF_TOKEN" \
-n vllm-semantic-router-system \
--dry-run=client -o yaml | kubectl apply -f -
echo "✓ Created hf-token-secret (HF_TOKEN is set)"
else
echo "⚠️ HF_TOKEN not set - gated models (e.g., embeddinggemma-300m) will be gracefully skipped"
fi
echo "::endgroup::"

- name: Install Helm chart (CI minimal config)
run: |
echo "::group::Install Chart"
# CI environment: Download only the essential model to avoid OOM
# Only download all-MiniLM-L12-v2 (smallest model, ~120 MB)
# Note: Default values include embeddinggemma-300m, which will be skipped if hf-token-secret is not available
helm install semantic-router ${{ env.CHART_PATH }} \
--namespace vllm-semantic-router-system \
--wait \
6 changes: 6 additions & 0 deletions .github/workflows/integration-test-k8s.yml
@@ -81,6 +81,12 @@ jobs:

- name: Run Integration E2E tests (${{ matrix.profile }})
id: e2e-test
env:
# HF_TOKEN is required for downloading gated models (e.g., embeddinggemma-300m)
# For PRs from forks, this will be empty and the E2E framework will gracefully skip gated model downloads
# The hf CLI uses HUGGINGFACE_HUB_TOKEN, so we set both for compatibility
HF_TOKEN: ${{ secrets.HF_TOKEN }}
HUGGINGFACE_HUB_TOKEN: ${{ secrets.HF_TOKEN }}
run: |
set +e # Don't exit on error; we want to capture the result
make e2e-test E2E_PROFILE=${{ matrix.profile }} E2E_VERBOSE=true E2E_KEEP_CLUSTER=false
6 changes: 5 additions & 1 deletion .github/workflows/performance-nightly.yml
@@ -70,9 +70,13 @@ jobs:
- name: Download models (minimal set for nightly)
env:
CI_MINIMAL_MODELS: false
HF_TOKEN: ${{ secrets.HF_TOKEN }}
HF_HUB_ENABLE_HF_TRANSFER: 1
HF_HUB_DISABLE_TELEMETRY: 1
# HF_TOKEN is required for downloading gated models (e.g., embeddinggemma-300m)
# For PRs from forks, this will be empty and the model_manager will gracefully skip gated models
# The hf CLI uses HUGGINGFACE_HUB_TOKEN, so we set both for compatibility
HF_TOKEN: ${{ secrets.HF_TOKEN }}
HUGGINGFACE_HUB_TOKEN: ${{ secrets.HF_TOKEN }}
run: make download-models

- name: Create reports directory
6 changes: 5 additions & 1 deletion .github/workflows/performance-test.yml
@@ -79,9 +79,13 @@ jobs:
- name: Download models (minimal)
env:
CI_MINIMAL_MODELS: true
HF_TOKEN: ${{ secrets.HF_TOKEN }}
HF_HUB_ENABLE_HF_TRANSFER: 1
HF_HUB_DISABLE_TELEMETRY: 1
# HF_TOKEN is required for downloading gated models (e.g., embeddinggemma-300m)
# For PRs from forks, this will be empty and the model_manager will gracefully skip gated models
# The hf CLI uses HUGGINGFACE_HUB_TOKEN, so we set both for compatibility
HF_TOKEN: ${{ secrets.HF_TOKEN }}
HUGGINGFACE_HUB_TOKEN: ${{ secrets.HF_TOKEN }}
run: make download-models

- name: Run component benchmarks
12 changes: 10 additions & 2 deletions .github/workflows/test-and-build.yml
@@ -138,12 +138,20 @@ jobs:
pip install -r src/model_manager/requirements.txt

- name: Download models (minimal on PRs)
run: |
echo "Downloading models for CI..."
echo "Note: Gated models (e.g., embeddinggemma-300m) will be gracefully skipped if HF_TOKEN is not available."
echo "This is expected for PRs from forks where secrets are not exposed."
make download-models
env:
CI_MINIMAL_MODELS: ${{ github.event_name == 'pull_request' }}
HF_TOKEN: ${{ secrets.HF_TOKEN }}
HF_HUB_ENABLE_HF_TRANSFER: 1
HF_HUB_DISABLE_TELEMETRY: 1
run: make download-models
# HF_TOKEN is required for downloading gated models (e.g., embeddinggemma-300m)
# For PRs from forks, this will be empty and the model_manager will gracefully skip gated models
# The hf CLI uses HUGGINGFACE_HUB_TOKEN, so we set both for compatibility
HF_TOKEN: ${{ secrets.HF_TOKEN }}
HUGGINGFACE_HUB_TOKEN: ${{ secrets.HF_TOKEN }}

- name: Start Milvus service
run: |
67 changes: 38 additions & 29 deletions candle-binding/semantic-router_test.go
@@ -1482,34 +1482,33 @@ func TestGetEmbeddingSmart(t *testing.T) {
}

t.Run("ShortTextHighLatency", func(t *testing.T) {
// Short text with high latency priority - uses Qwen3 (1024) since Gemma is not available
// Short text with high latency priority should use Gemma (768)
text := "Hello world"
embedding, err := GetEmbeddingSmart(text, 0.3, 0.8)

if err != nil {
t.Fatalf("GetEmbeddingSmart failed: %v", err)
}

// Expect Qwen3 (1024) dimension since Gemma is not available
if len(embedding) != 1024 {
t.Errorf("Expected 1024-dim embedding, got %d", len(embedding))
if len(embedding) != 768 {
t.Errorf("Expected 768-dim embedding, got %d", len(embedding))
}

t.Logf("Short text embedding generated: dim=%d", len(embedding))
})

t.Run("MediumTextBalanced", func(t *testing.T) {
// Medium text with balanced priorities - uses Qwen3 (1024) since Gemma is not available
// Medium text with balanced priorities - may select Qwen3 (1024) or Gemma (768)
text := strings.Repeat("This is a medium length text with enough words to exceed 512 tokens. ", 10)
embedding, err := GetEmbeddingSmart(text, 0.5, 0.5)

if err != nil {
t.Fatalf("GetEmbeddingSmart failed: %v", err)
}

// Expect Qwen3 (1024) dimension since Gemma is not available
if len(embedding) != 1024 {
t.Errorf("Expected 1024-dim embedding, got %d", len(embedding))
// Accept both Qwen3 (1024) and Gemma (768) dimensions
if len(embedding) != 768 && len(embedding) != 1024 {
t.Errorf("Expected 768 or 1024-dim embedding, got %d", len(embedding))
}

t.Logf("Medium text embedding generated: dim=%d", len(embedding))
@@ -1569,9 +1568,9 @@ func TestGetEmbeddingSmart(t *testing.T) {
return
}

// Expect Qwen3 (1024) since Gemma is not available
if len(embedding) != 1024 {
t.Errorf("Expected 1024-dim embedding, got %d", len(embedding))
// Smart routing may select Qwen3 (1024) or Gemma (768) based on priorities
if len(embedding) != 768 && len(embedding) != 1024 {
t.Errorf("Expected 768 or 1024-dim embedding, got %d", len(embedding))
}
t.Logf("Priority test %s: generated %d-dim embedding", tc.desc, len(embedding))
})
@@ -1594,9 +1593,9 @@ func TestGetEmbeddingSmart(t *testing.T) {
continue
}

// Expect Qwen3 (1024) since Gemma is not available
if len(embedding) != 1024 {
t.Errorf("Iteration %d: Expected 1024-dim embedding, got %d", i, len(embedding))
// Smart routing may select Qwen3 (1024) or Gemma (768)
if len(embedding) != 768 && len(embedding) != 1024 {
t.Errorf("Iteration %d: Expected 768 or 1024-dim embedding, got %d", i, len(embedding))
}

// Verify no nil pointers
@@ -1635,12 +1634,11 @@ func BenchmarkGetEmbeddingSmart(b *testing.B) {
}

// Test constants for embedding models (Phase 4.2)
// Note: Gemma model is gated and requires HF_TOKEN, so tests use Qwen3 only
const (
Qwen3EmbeddingModelPath = "../models/Qwen3-Embedding-0.6B"
GemmaEmbeddingModelPath = "" // Gemma is gated, not used in CI tests
GemmaEmbeddingModelPath = "../models/embeddinggemma-300m"
TestEmbeddingText = "This is a test sentence for embedding generation"
TestLongContextText = "This is a longer text that might benefit from long-context embedding models like Qwen3"
TestLongContextText = "This is a longer text that might benefit from long-context embedding models like Qwen3 or Gemma"
)

// Test constants for Qwen3 Multi-LoRA
@@ -1702,8 +1700,22 @@ func TestInitEmbeddingModels(t *testing.T) {
})

t.Run("InitGemmaOnly", func(t *testing.T) {
// Gemma is a gated model requiring HF_TOKEN, skip in CI
t.Skip("Skipping Gemma-only test: Gemma is a gated model requiring HF_TOKEN")
err := InitEmbeddingModels("", GemmaEmbeddingModelPath, true)
if err != nil {
t.Logf("InitEmbeddingModels (Gemma only) returned error (may already be initialized): %v", err)

// Verify functionality
_, testErr := GetEmbeddingSmart("test", 0.5, 0.5)
if testErr == nil {
t.Log("✓ ModelFactory is functional (already initialized)")
} else {
if isModelInitializationError(testErr) {
t.Skipf("Skipping test due to model unavailability: %v", testErr)
}
}
} else {
t.Log("✓ Gemma model initialized successfully")
}
})

t.Run("InitWithInvalidPaths", func(t *testing.T) {
@@ -1785,16 +1797,16 @@ func TestGetEmbeddingWithDim(t *testing.T) {

t.Run("OversizedDimension", func(t *testing.T) {
// Test graceful degradation when requested dimension exceeds model capacity
// Qwen3: 1024, so 2048 should fall back to full dimension
// Qwen3: 1024, Gemma: 768, so 2048 should fall back to full dimension
embedding, err := GetEmbeddingWithDim(TestEmbeddingText, 0.5, 0.5, 2048)
if err != nil {
t.Errorf("Should gracefully handle oversized dimension, got error: %v", err)
return
}

// Should return full dimension (1024 for Qwen3)
if len(embedding) != 1024 {
t.Errorf("Expected full dimension (1024), got %d", len(embedding))
// Should return full dimension (1024 for Qwen3 or 768 for Gemma)
if len(embedding) != 1024 && len(embedding) != 768 {
t.Errorf("Expected full dimension (1024 or 768), got %d", len(embedding))
} else {
t.Logf("✓ Oversized dimension gracefully degraded to full dimension: %d", len(embedding))
}
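The OversizedDimension test above encodes two rules: a request above the model's native dimension falls back to the full embedding, and a smaller request is served by Matryoshka-style prefix truncation. A sketch of that rule under the assumption that truncated vectors are re-normalized (the function name is illustrative):

```python
# Hypothetical sketch of the dimension rule these tests exercise; the real
# implementation lives in the candle-binding layer, not in this PR.
import numpy as np

def apply_requested_dim(full: np.ndarray, requested_dim: int) -> np.ndarray:
    native_dim = full.shape[-1]  # 1024 for Qwen3, 768 for Gemma
    if requested_dim >= native_dim:
        return full  # graceful degradation: return the full dimension
    truncated = full[..., :requested_dim]  # Matryoshka prefix truncation
    # Re-normalize so cosine similarities stay meaningful after truncation.
    norms = np.linalg.norm(truncated, axis=-1, keepdims=True)
    return truncated / np.clip(norms, 1e-12, None)
```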
@@ -1889,9 +1901,6 @@ func TestEmbeddingPriorityRouting(t *testing.T) {
if err != nil {
t.Fatalf("Failed to initialize embedding models: %v", err)
}

// Note: These tests use Matryoshka dimension truncation (768) with Qwen3 model
// The dimension is truncated from Qwen3's full 1024 dimensions
testCases := []struct {
name string
text string
@@ -1906,23 +1915,23 @@
qualityPriority: 0.2,
latencyPriority: 0.9,
expectedDim: 768,
description: "Uses Qwen3 with Matryoshka 768 truncation",
description: "Should prefer faster embedding model (Gemma > Qwen3)",
},
{
name: "HighQualityPriority",
text: strings.Repeat("Long context text ", 30),
qualityPriority: 0.9,
latencyPriority: 0.2,
expectedDim: 768,
description: "Uses Qwen3 with Matryoshka 768 truncation",
description: "Should prefer quality model (Qwen3/Gemma)",
},
{
name: "BalancedPriority",
text: "Medium length text for embedding",
qualityPriority: 0.5,
latencyPriority: 0.5,
expectedDim: 768,
description: "Uses Qwen3 with Matryoshka 768 truncation",
description: "Should select based on text length",
},
}

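The updated test descriptions ("should prefer faster embedding model", "should select based on text length") imply a weighted scoring heuristic over the two models. One plausible shape for that rule, sketched with assumed quality/speed scores and context limits; the actual candle-binding selection logic is not part of this diff:

```python
# Hypothetical sketch of priority-based model selection; the quality and
# speed figures below are assumptions, not measured values.
from dataclasses import dataclass

@dataclass(frozen=True)
class EmbeddingModel:
    name: str
    dim: int
    max_tokens: int  # assumed context limits: Qwen3 32768, Gemma 2048
    quality: float   # assumed relative quality in [0, 1]
    speed: float     # assumed relative speed in [0, 1]

QWEN3 = EmbeddingModel("Qwen3-Embedding-0.6B", 1024, 32768, quality=0.9, speed=0.6)
GEMMA = EmbeddingModel("embeddinggemma-300m", 768, 2048, quality=0.7, speed=0.9)

def select_model(text_tokens: int, quality_priority: float,
                 latency_priority: float) -> EmbeddingModel:
    # Only models whose context window fits the input are candidates.
    candidates = [m for m in (QWEN3, GEMMA) if m.max_tokens >= text_tokens]
    if not candidates:
        return QWEN3  # fall back to the long-context model
    # A high latency priority favors the faster model; a high quality
    # priority favors the stronger one.
    return max(candidates, key=lambda m: quality_priority * m.quality
                                         + latency_priority * m.speed)
```

Under these assumed scores, "Hello world" with latency priority 0.8 and quality priority 0.3 scores Gemma at 0.93 versus Qwen3 at 0.75, consistent with the 768-dim expectation in ShortTextHighLatency.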
4 changes: 4 additions & 0 deletions config/model_manager/models.lora.yaml
@@ -29,3 +29,7 @@ models:

- id: Qwen3-Embedding-0.6B
repo_id: Qwen/Qwen3-Embedding-0.6B

# Gated model - requires HF_TOKEN (will be gracefully skipped if the token is not available)
- id: embeddinggemma-300m
repo_id: google/embeddinggemma-300m
8 changes: 6 additions & 2 deletions config/model_manager/models.minimal.yaml
@@ -10,8 +10,8 @@
# Equivalent to: make download-models-minimal
# or CI_MINIMAL_MODELS=true make download-models
#
# Note: This is the minimal set for fast CI runs. Larger models like
# embeddinggemma-300m are in models.yaml (full set) for local development.
# Note: This is the minimal set for fast CI runs. Gated models like
# embeddinggemma-300m will be gracefully skipped if HF_TOKEN is not available.

cache_dir: "models"
verify: "size" # Use size for faster CI runs
@@ -56,6 +56,10 @@ models:
- id: Qwen3-Embedding-0.6B
repo_id: Qwen/Qwen3-Embedding-0.6B

# Gated model - requires HF_TOKEN (will be gracefully skipped if the token is not available)
- id: embeddinggemma-300m
repo_id: google/embeddinggemma-300m

# =============================================================================
# Hallucination Detection - Required for hallucination tests
# =============================================================================
29 changes: 25 additions & 4 deletions deploy/helm/semantic-router/templates/deployment.yaml
@@ -42,13 +42,33 @@ spec:
command: ["/bin/bash", "-c"]
args:
- |
set -e
echo "Downloading models to persistent volume..."
cd /app/models

{{- range .Values.initContainer.models }}
# Download {{ .name }}
echo "Downloading {{ .name }} from {{ .repo }}..."

{{- if or (eq .name "embeddinggemma-300m") (contains "embeddinggemma" .name) }}
# Skip gated models if token is missing
if [ -z "${HF_TOKEN:-}" ] && [ -z "${HUGGINGFACE_HUB_TOKEN:-}" ]; then
echo "⚠️ Skipping {{ .name }} (HF_TOKEN not set, gated model requires authentication)"
else
# Remove .cache directory to ensure fresh download
rm -rf "{{ .name }}/.cache" 2>/dev/null || true
# Download with ignore_patterns to exclude ONNX-only files if pytorch model exists
python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='{{ .repo }}', local_dir='{{ .name }}', ignore_patterns=['*.onnx', '*.msgpack', '*.h5', '*.tflite'] if '{{ .name }}' == 'all-MiniLM-L12-v2' else None)"

# Check for required model files
echo "Checking {{ .name }} for required files:"
if [ -f "{{ .name }}/pytorch_model.bin" ] || [ -f "{{ .name }}/model.safetensors" ]; then
echo "✓ Found PyTorch model weights in {{ .name }}"
else
echo "✗ WARNING: No PyTorch model weights found in {{ .name }}"
ls -la "{{ .name }}/" | head -20
fi
fi # End of HF_TOKEN check for Gemma
{{- else }}
# Remove .cache directory to ensure fresh download
rm -rf "{{ .name }}/.cache" 2>/dev/null || true
# Download with ignore_patterns to exclude ONNX-only files if pytorch model exists
@@ -62,16 +82,17 @@ spec:
echo "✗ WARNING: No PyTorch model weights found in {{ .name }}"
ls -la "{{ .name }}/" | head -20
fi
{{- end }}

{{- end }}
echo "All models downloaded successfully!"
ls -la /app/models/
env:
- name: HF_HUB_CACHE
value: /tmp/hf_cache
{{- with .Values.initContainer.env }}
{{- toYaml . | nindent 10 }}
{{- end }}
{{- with .Values.initContainer.env }}
{{- toYaml . | nindent 8 }}
{{- end }}
resources:
{{- toYaml .Values.initContainer.resources | nindent 10 }}
volumeMounts:
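For readability, the `python -c` one-liner embedded in the init container above is equivalent to this expanded form (only the `download` helper name is new; `repo_id` and `name` mirror the template's `{{ .repo }}` and `{{ .name }}`):

```python
# Expanded form of the init container's `python -c` one-liner.
from huggingface_hub import snapshot_download

def download(repo_id: str, name: str) -> None:
    # all-MiniLM-L12-v2 ships PyTorch weights, so skip ONNX and other
    # framework exports; every other model downloads all files.
    ignore = (
        ["*.onnx", "*.msgpack", "*.h5", "*.tflite"]
        if name == "all-MiniLM-L12-v2"
        else None
    )
    snapshot_download(repo_id=repo_id, local_dir=name, ignore_patterns=ignore)
```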