Change to chat endpoint + RAG improvements #9

Workflow file for this run

.github/workflows/test_embeddings.yml at fdfb2a9

	# Copyright(C) 2024-2025 Advanced Micro Devices, Inc. All rights reserved.
	# SPDX-License-Identifier: MIT

	name: Test Lemonade Embeddings

	# Starts a local Lemonade server on a self-hosted runner, pulls and loads the
	# embedding model, then runs tests/test_lemonade_embeddings.py against it.

	on:
	  workflow_call:
	  push:
	    branches: [main]
	  pull_request:
	    branches: [main]
	    types: [opened, synchronize, reopened, ready_for_review]
	  merge_group:
	  workflow_dispatch:

	permissions:
	  contents: read

	jobs:
	  test-embeddings:
	    name: Test Lemonade Embeddings API
	    runs-on: [stx]
	    # Bound the job so a hung server or test cannot hold the self-hosted
	    # runner for the 360-minute default.
	    timeout-minutes: 60

	    steps:
	      - uses: actions/checkout@v5

	      - name: Setup Python environment
	        uses: ./.github/actions/setup-venv
	        with:
	          python-version: '3.12'
	          install-package: '.[dev,rag]'

	      - name: Install Lemonade Server
	        uses: ./.github/actions/install-lemonade

	      # Server start-up, model preparation and the tests all run in one step
	      # because the server lives in a PowerShell background job, which only
	      # exists for the lifetime of this shell session.
	      - name: Start Lemonade Server and Run Tests
	        shell: powershell
	        run: |
	          # Set console to UTF-8 for Unicode support
	          [Console]::OutputEncoding = [System.Text.Encoding]::UTF8
	          [Console]::InputEncoding = [System.Text.Encoding]::UTF8
	          $OutputEncoding = [System.Text.Encoding]::UTF8
	          $env:PYTHONIOENCODING = "utf-8"
	          $env:PYTHONUTF8 = "1"
	          chcp 65001 | Out-Null

	          # Shared settings (the port must match the one passed to
	          # `lemonade-server serve` below)
	          $baseUrl = "http://localhost:8000/api/v1"
	          $embeddingModel = "nomic-embed-text-v2-moe-GGUF"

	          # Initialised before `try` so the `finally` block can tell whether
	          # a server job was ever created and whether the step succeeded.
	          $serverJob = $null
	          $succeeded = $false

	          try {
	              Write-Host "Starting Lemonade server..."
	              $serverJob = Start-Job -ScriptBlock {
	                  # Workaround for Issue #612: Disable Vulkan cooperative matrix optimization
	                  $env:GGML_VK_DISABLE_COOPMAT = "1"
	                  & lemonade-server serve --host localhost --port 8000 --no-tray 2>&1
	              }
	              Write-Host "Started Lemonade server job with ID: $($serverJob.Id)"

	              # Wait for server to be ready
	              Write-Host "Waiting for Lemonade server to start..."
	              $maxWaitTime = 60
	              $waitTime = 0
	              $serverReady = $false

	              while ($waitTime -lt $maxWaitTime -and -not $serverReady) {
	                  Start-Sleep -Seconds 2
	                  $waitTime += 2

	                  # Fail fast: if the background job has already ended, the
	                  # server can never become healthy, so stop polling.
	                  if ($serverJob.State -in 'Completed', 'Failed', 'Stopped') {
	                      Write-Host "[ERROR] Lemonade server job exited early (state: $($serverJob.State))"
	                      throw "Server failed to start"
	                  }

	                  try {
	                      $response = Invoke-RestMethod -Uri "$baseUrl/health" -Method GET -TimeoutSec 5
	                      Write-Host "[OK] Lemonade server is ready"
	                      Write-Host "Health response: $($response | ConvertTo-Json -Compress)"
	                      $serverReady = $true
	                  } catch {
	                      Write-Host "Waiting... ($waitTime/$maxWaitTime seconds)"
	                  }
	              }

	              if (-not $serverReady) {
	                  Write-Host "[ERROR] Server health check failed after $maxWaitTime seconds"
	                  throw "Server failed to start"
	              }

	              # Clear any cached/corrupted model files to force fresh download
	              # See: https://github.com/ggml-org/llama.cpp/issues/13534
	              Write-Host "`n=== Clearing Model Cache ==="
	              $lemonadeCache = "$env:LOCALAPPDATA\lemonade-server"
	              if (Test-Path "$lemonadeCache\models") {
	                  Write-Host "Removing cached models from: $lemonadeCache\models"
	                  Get-ChildItem "$lemonadeCache\models" -Directory | ForEach-Object {
	                      Write-Host " Removing: $($_.Name)"
	                      Remove-Item $_.FullName -Recurse -Force -ErrorAction SilentlyContinue
	                  }
	              }

	              # Pull embedding model (actual model used in tests).
	              # Best effort: a failed pull only warns here, because the load
	              # step below fails the run if the model is really unavailable.
	              Write-Host "`n=== Pulling Embedding Model ==="
	              Write-Host "Pulling $embeddingModel..."
	              try {
	                  $body = @{ model_name = $embeddingModel } | ConvertTo-Json
	                  $response = Invoke-RestMethod -Uri "$baseUrl/pull" `
	                      -Method POST -ContentType "application/json" -Body $body -TimeoutSec 600
	                  Write-Host " [OK] Model pull initiated"
	              } catch {
	                  Write-Host " [WARN] Pull may have failed: $($_.Exception.Message)"
	              }

	              # Load embedding model into memory (required in Lemonade v9.x)
	              Write-Host "`n=== Loading Embedding Model ==="
	              try {
	                  $loadRequest = @{ model_name = $embeddingModel } | ConvertTo-Json

	                  Write-Host "Loading model: $embeddingModel"
	                  $loadResponse = Invoke-RestMethod -Uri "$baseUrl/load" `
	                      -Method POST -Body $loadRequest -ContentType "application/json" -TimeoutSec 60
	                  Write-Host "[OK] Model loaded successfully: $($loadResponse | ConvertTo-Json -Compress)"
	              } catch {
	                  Write-Host "[ERROR] Model load failed: $($_.Exception.Message)"
	                  if ($_.ErrorDetails) {
	                      Write-Host "Error details: $($_.ErrorDetails.Message)"
	                  }
	                  throw "Failed to load embedding model"
	              }

	              # Wait for llamacpp backend to fully initialize (increased from 10s)
	              Write-Host "Waiting 30 seconds for llamacpp backend initialization..."
	              Start-Sleep -Seconds 30

	              # Verify model is available (informational only; never fails the run)
	              try {
	                  $models = Invoke-RestMethod -Uri "$baseUrl/models" -Method GET
	                  Write-Host "`n[OK] Available models:"
	                  $models.data | ForEach-Object { Write-Host " - $($_.id)" }
	              } catch {
	                  Write-Host "[WARN] Could not verify model: $($_.Exception.Message)"
	              }

	              # Verify server is still responding before embeddings test.
	              # On failure the server output is printed by the `finally` block.
	              Write-Host "`n=== Verifying Server Health ==="
	              try {
	                  $health = Invoke-RestMethod -Uri "$baseUrl/health" -Method GET -TimeoutSec 10
	                  Write-Host "[OK] Server responding: $($health | ConvertTo-Json -Compress)"
	              } catch {
	                  Write-Host "[ERROR] Server health check failed: $($_.Exception.Message)"
	                  throw "Server not responding after model load"
	              }

	              # Verify embedding model with actual API call
	              Write-Host "`n=== Verifying Embedding Model ==="
	              $maxRetries = 3
	              $retryCount = 0
	              $modelReady = $false

	              while ($retryCount -lt $maxRetries -and -not $modelReady) {
	                  try {
	                      $testBody = @{ input = @("test embedding"); model = $embeddingModel } | ConvertTo-Json
	                      # Use localhost consistently and increased timeout for first embedding request.
	                      # Only success/failure matters here, so the response body is discarded.
	                      $null = Invoke-RestMethod -Uri "$baseUrl/embeddings" `
	                          -Method POST -ContentType "application/json" -Body $testBody -TimeoutSec 300
	                      Write-Host "[OK] Embedding model verified successfully"
	                      $modelReady = $true
	                  } catch {
	                      $retryCount++
	                      Write-Host "[WARN] Embedding verification attempt $retryCount failed: $($_.Exception.Message)"
	                      if ($retryCount -lt $maxRetries) {
	                          Write-Host "Waiting 30 seconds before retry..."
	                          Start-Sleep -Seconds 30
	                      }
	                  }
	              }

	              if (-not $modelReady) {
	                  throw "Embedding model failed to load after $maxRetries attempts"
	              }

	              # Run tests in same session while server is running
	              Write-Host "`n=== Running Embedding Tests ==="
	              # Use --tb=long for full traceback, -rA for all test output summary.
	              # --junitxml writes a report under pytest-results/ so the upload
	              # step has something to collect from that directory.
	              pytest tests/test_lemonade_embeddings.py -v -s --tb=long -rA --log-cli-level=DEBUG `
	                  --junitxml=pytest-results/junit.xml
	              $testExitCode = $LASTEXITCODE

	              Write-Host "`nTests completed with exit code: $testExitCode"

	              if ($testExitCode -ne 0) {
	                  throw "Tests failed with exit code $testExitCode"
	              }

	              $succeeded = $true
	          } finally {
	              # Always cleanup server
	              if ($serverJob) {
	                  if (-not $succeeded) {
	                      # Print whatever the server wrote so failures can be
	                      # diagnosed from the run log. Wrapped in try/catch so a
	                      # diagnostics problem never masks the original error.
	                      Write-Host "`n=== Lemonade Server Output ==="
	                      try {
	                          Receive-Job -Job $serverJob -ErrorAction SilentlyContinue |
	                              Out-String | Write-Host
	                      } catch {
	                          Write-Host "[WARN] Could not read server output: $($_.Exception.Message)"
	                      }
	                  }

	                  Write-Host "`n=== Stopping Lemonade Server ==="
	                  Stop-Job -Job $serverJob -ErrorAction SilentlyContinue
	                  Remove-Job -Job $serverJob -Force -ErrorAction SilentlyContinue
	                  Write-Host "[OK] Server stopped"
	              }
	          }

	      # Runs even when the previous step failed so results are always kept.
	      - name: Upload test results
	        if: always()
	        uses: actions/upload-artifact@v4
	        with:
	          name: embedding-test-results
	          path: |
	            pytest-results/
	            *.log

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Change to chat endpoint + RAG improvements #9

Workflow file

Change to chat endpoint + RAG improvements #9

Uh oh!

Workflow file for this run