fix: update embedding model ID, llama-stack clone URL, run.yaml resource layout, and test scripts

leseb · leseb · commit 4f78bc540fa8 · 2025-11-28T17:02:13.000+01:00
Signed-off-by: Sébastien Han <seb@redhat.com>
diff --git a/.github/workflows/redhat-distro-container.yml b/.github/workflows/redhat-distro-container.yml
@@ -44,7 +44,7 @@ jobs:
     runs-on: ubuntu-latest
     env:
       INFERENCE_MODEL: Qwen/Qwen3-0.6B
-      EMBEDDING_MODEL: granite-embedding-125m
+      EMBEDDING_MODEL: ibm-granite/granite-embedding-125m-english
       VLLM_URL: http://localhost:8000/v1
       LLAMA_STACK_COMMIT_SHA: ${{ github.event.inputs.llama_stack_commit_sha || 'main' }}
     strategy:
@@ -73,7 +73,7 @@ jobs:
           LLAMA_STACK_VERSION: ${{ env.LLAMA_STACK_COMMIT_SHA }}
         run: |
           tmp_build_dir=$(mktemp -d)
-          git clone --filter=blob:none --no-checkout https://github.com/llamastack/llama-stack.git "$tmp_build_dir"
+          git clone --filter=blob:none --no-checkout https://github.com/opendatahub-io/llama-stack.git "$tmp_build_dir"
           cd "$tmp_build_dir"
           git checkout "$LLAMA_STACK_VERSION"
           python3 -m venv .venv
diff --git a/distribution/run.yaml b/distribution/run.yaml
@@ -238,27 +238,29 @@ storage:
     prompts:
       namespace: prompts
       backend: kv_default
-models:
-- metadata: {}
-  model_id: ${env.INFERENCE_MODEL}
-  provider_id: vllm-inference
-  model_type: llm
-- metadata:
-    embedding_dimension: 768
-  model_id: granite-embedding-125m
-  provider_id: sentence-transformers
-  provider_model_id: ibm-granite/granite-embedding-125m-english
-  model_type: embedding
-shields: []
-vector_dbs: []
-datasets: []
-scoring_fns: []
-benchmarks: []
-tool_groups:
-- toolgroup_id: builtin::websearch
-  provider_id: tavily-search
-- toolgroup_id: builtin::rag
-  provider_id: rag-runtime
+registered_resources:
+  models:
+  - metadata: {}
+    model_id: ${env.INFERENCE_MODEL}
+    provider_id: vllm-inference
+    model_type: llm
+
+  - metadata:
+      embedding_dimension: 768
+    model_id: granite-embedding-125m
+    provider_id: sentence-transformers
+    provider_model_id: ibm-granite/granite-embedding-125m-english
+    model_type: embedding
+  shields: []
+  vector_dbs: []
+  datasets: []
+  scoring_fns: []
+  benchmarks: []
+  tool_groups:
+  - toolgroup_id: builtin::websearch
+    provider_id: tavily-search
+  - toolgroup_id: builtin::rag
+    provider_id: rag-runtime
 telemetry:
   enabled: true
 server:
diff --git a/tests/run_integration_tests.sh b/tests/run_integration_tests.sh
@@ -5,6 +5,7 @@ set -exuo pipefail
 # Configuration
 WORK_DIR="/tmp/llama-stack-integration-tests"
 INFERENCE_MODEL="${INFERENCE_MODEL:-Qwen/Qwen3-0.6B}"
+EMBEDDING_MODEL="${EMBEDDING_MODEL:-ibm-granite/granite-embedding-125m-english}"
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 
@@ -38,10 +39,15 @@ function clone_llama_stack() {
     cd "$WORK_DIR"
     # fetch origin incase we didn't clone a fresh repo
     git fetch origin
-    if ! git checkout "v$LLAMA_STACK_VERSION"; then
-        echo "Error: Could not checkout tag v$LLAMA_STACK_VERSION"
+    if [ "$LLAMA_STACK_VERSION" == "main" ]; then
+        checkout_to="main"
+    else
+        checkout_to="v$LLAMA_STACK_VERSION"
+    fi
+    if ! git checkout "$checkout_to"; then
+        echo "Error: Could not checkout $checkout_to"
         echo "Available tags:"
-        git tag | grep "^v" | tail -10
+        git tag | tail -10
         exit 1
     fi
 }
@@ -64,12 +70,14 @@ function run_integration_tests() {
         exit 1
     fi
 
-    # TODO: remove this once we have a stable version of llama-stack client
-    # Currently, LLS client version is 0.3.0, while the server version is 0.3.0rc3+rhai0
-    uv run --with llama-stack-client==0.3.0 pytest -s -v tests/integration/inference/ \
+    uv venv
+    # shellcheck source=/dev/null
+    source .venv/bin/activate
+    uv pip install llama-stack-client
+    uv run pytest -s -v tests/integration/inference/ \
         --stack-config=server:"$STACK_CONFIG_PATH" \
         --text-model=vllm-inference/"$INFERENCE_MODEL" \
-        --embedding-model=granite-embedding-125m \
+        --embedding-model=sentence-transformers/"$EMBEDDING_MODEL" \
         -k "not ($SKIP_TESTS)"
 }
 
diff --git a/tests/smoke.sh b/tests/smoke.sh
@@ -35,27 +35,33 @@ function start_and_wait_for_llama_stack_container {
 }
 
 function test_model_list {
-  echo "===> Looking for model $INFERENCE_MODEL..."
-  resp=$(curl -fsS http://127.0.0.1:8321/v1/models)
-  if echo "$resp" | grep -q "$INFERENCE_MODEL"; then
-    echo "Model $INFERENCE_MODEL was found :)"
-    return
-  else
-    echo "Model $INFERENCE_MODEL was not found :("
-    echo "Container logs:"
-    docker logs llama-stack || true
-    exit 1
-  fi
+  for model in "$INFERENCE_MODEL" "$EMBEDDING_MODEL"; do
+    echo "===> Looking for model $model..."
+    resp=$(curl -fsS http://127.0.0.1:8321/v1/models)
+    echo "Response: $resp"
+    if echo "$resp" | grep -q "$model"; then
+      echo "Model $model was found :)"
+      continue
+    else
+      echo "Model $model was not found :("
+      echo "Response: $resp"
+      echo "Container logs:"
+      docker logs llama-stack || true
+      return 1
+    fi
+  done
+  return 0
 }
 
 function test_model_openai_inference {
   echo "===> Attempting to chat with model $INFERENCE_MODEL..."
-  resp=$(curl -fsS http://127.0.0.1:8321/v1/chat/completions -H "Content-Type: application/json" -d "{\"model\": \"$INFERENCE_MODEL\",\"messages\": [{\"role\": \"user\", \"content\": \"What color is grass?\"}], \"max_tokens\": 128, \"temperature\": 0.0}")
+  resp=$(curl -fsS http://127.0.0.1:8321/v1/chat/completions -H "Content-Type: application/json" -d "{\"model\": \"vllm-inference/$INFERENCE_MODEL\",\"messages\": [{\"role\": \"user\", \"content\": \"What color is grass?\"}], \"max_tokens\": 128, \"temperature\": 0.0}")
   if echo "$resp" | grep -q "green"; then
     echo "===> Inference is working :)"
     return
   else
     echo "===> Inference is not working :("
+    echo "Response: $resp"
     echo "Container logs:"
     docker logs llama-stack || true
     exit 1
@@ -65,7 +71,10 @@ function test_model_openai_inference {
 main() {
   echo "===> Starting smoke test..."
   start_and_wait_for_llama_stack_container
-  test_model_list
+  if ! test_model_list; then
+    echo "Model list test failed :("
+    exit 1
+  fi
   test_model_openai_inference
   echo "===> Smoke test completed successfully!"
 }