diff --git a/.github/workflows/test-playbooks.yml b/.github/workflows/test-playbooks.yml
index 1d7b8905..a15ac2a0 100644
--- a/.github/workflows/test-playbooks.yml
+++ b/.github/workflows/test-playbooks.yml
@@ -139,7 +139,7 @@ jobs:
       - name: Setup Python
         uses: actions/setup-python@v5
         with:
-          python-version: ${{ matrix.playbook == 'open-webui-chat' && '3.12' || '3.13' }}
+          python-version: ${{ (matrix.playbook == 'open-webui-chat' || matrix.playbook == 'vllm-inference') && '3.12' || '3.13' }}
 
       - name: Install test dependencies
         run: |
diff --git a/playbooks/supplemental/vllm-inference/README.md b/playbooks/supplemental/vllm-inference/README.md
index 29b36438..57e85743 100644
--- a/playbooks/supplemental/vllm-inference/README.md
+++ b/playbooks/supplemental/vllm-inference/README.md
@@ -38,15 +38,56 @@ For this playbook, we'll use the prebuilt AMD ROCm wheel from AMD's package inde
 
 ### Install vLLM
 
+
+```python
+import os
+import sys
+import ast
+
+# Check that required script files exist
+scripts = ['chat_with_model.py', 'curl_script.sh']
+missing = [s for s in scripts if not os.path.exists(s)]
+
+if missing:
+    print(f"FAIL: Missing files: {missing}")
+    sys.exit(1)
+print("PASS: All required script files exist")
+
+# Verify Python scripts have valid syntax
+for script in ['chat_with_model.py']:
+    with open(script, 'r') as f:
+        ast.parse(f.read())
+    print(f"PASS: {script} has valid syntax")
+```
+
+
+
+```bash
+set -euo pipefail
+
+test -f curl_script.sh
+test -f chat_with_model.py
+
+python -m py_compile chat_with_model.py
+bash curl_script.sh --help >/dev/null
+
+echo "PASS: Required vLLM asset scripts are present and valid"
+```
+
+
 
 Create a Python 3.12 virtual environment and activate it:
+
 
 ```bash
-python -m venv .venv
+python3 -m venv .venv
 source .venv/bin/activate
 ```
+
+
 
 Install PyTorch 2.9.1 built for ROCm 7.12.0, along with the required ROCm Python packages, in the virtual environment:
+
 
 ```bash
 python -m pip install \
@@ -54,14 +95,17 @@ python -m pip install \
   --index-url https://repo.amd.com/rocm/whl/gfx1151/ \
   "torchaudio==2.9.0+rocm7.12.0" \
   "torchvision==0.24.0+rocm7.12.0"
 ```
+
 
 Install vLLM from the prebuilt ROCm wheel:
+
 
 ```bash
 python -m pip install \
   --extra-index-url https://rocm.frameworks.amd.com/whl/gfx1151/ \
   "vllm==0.16.1.dev10+g11515110f.d20260323.rocm712"
 ```
+
 
 Set the environment variables required by the ROCm pip packages before starting vLLM:
@@ -78,6 +122,24 @@ echo "=== PyTorch ===" && python -c "import torch; print('PyTorch:', torch.__ver
 echo "=== flash-attn ===" && python -c "import flash_attn; print('flash-attn:', flash_attn.__version__)"
 ```
 
+
+```bash
+set -euo pipefail
+python3 --version
+which python3
+```
+
+
+
+```bash
+export PYTHONPATH=.venv/lib/python3.12/site-packages/_rocm_sdk_core/share/amd_smi
+export FLASH_ATTENTION_TRITON_AMD_ENABLE=TRUE
+echo "=== vLLM ===" && python -c "import vllm; print('vLLM version:', vllm.__version__)"
+echo "=== PyTorch ===" && python -c "import torch; print('PyTorch:', torch.__version__); print('HIP available:', torch.cuda.is_available()); print('HIP built:', torch.backends.hip.is_built() if hasattr(torch.backends, 'hip') else 'N/A')"
+echo "=== flash-attn ===" && python -c "import flash_attn; print('flash-attn:', flash_attn.__version__)"
+```
+
+
 ## Quick Start
 
 ### 1. Start the vLLM Server
@@ -108,7 +170,7 @@ vllm serve Qwen/Qwen3-1.7B \
 You can test the server using the curl script:
 
 ```bash
-./assets/curl_script.sh
+./curl_script.sh
 ```
 
 Or use the curl command directly:
@@ -174,8 +236,177 @@ for chunk in response:
 
 Run the script:
 
 ```bash
-python assets/chat_with_model.py
+python chat_with_model.py
+```
+
+
+```bash
+set -euo pipefail
+
+source .venv/bin/activate
+
+python -m pip install openai
+
+export PYTHONPATH="$PWD/.venv/lib/python3.12/site-packages/_rocm_sdk_core/share/amd_smi"
+export FLASH_ATTENTION_TRITON_AMD_ENABLE=TRUE
+
+server_pid=""
+
+cleanup() {
+  if [ -n "${server_pid:-}" ] && kill -0 "$server_pid" 2>/dev/null; then
+    kill "$server_pid" 2>/dev/null || true
+    sleep 2
+    kill -9 "$server_pid" 2>/dev/null || true
+  fi
+}
+trap cleanup EXIT
+
+vllm serve Qwen/Qwen3-1.7B \
+  --host 127.0.0.1 \
+  --port 8000 \
+  --max-model-len 2048 \
+  --gpu-memory-utilization 0.7 \
+  >/tmp/vllm-test.log 2>&1 &
+
+server_pid=$!
+
+ready=false
+for i in $(seq 1 300); do
+  code="$(curl -s -o /dev/null -w "%{http_code}" --max-time 2 http://127.0.0.1:8000/health || true)"
+  if [ "$code" = "200" ]; then
+    ready=true
+    break
+  fi
+  sleep 2
+done
+
+if [ "$ready" != "true" ]; then
+  echo "FAIL: vLLM server did not become ready on http://127.0.0.1:8000/health"
+  echo "Last 200 lines of vLLM log:"
+  tail -n 200 /tmp/vllm-test.log || true
+  exit 1
+fi
+
+echo "PASS: vLLM server is responding on /health"
+
+models_json="$(curl -s --max-time 10 http://127.0.0.1:8000/v1/models || true)"
+
+if [ -z "$models_json" ]; then
+  echo "FAIL: Empty response from /v1/models"
+  exit 1
+fi
+
+export MODELS_JSON="$models_json"
+python - <<'PY'
+import json
+import os
+import sys
+
+data = json.loads(os.environ["MODELS_JSON"])
+model_ids = [item.get("id") for item in data.get("data", [])]
+
+print("Available models:", model_ids)
+
+if "Qwen/Qwen3-1.7B" not in model_ids:
+    print("FAIL: Qwen/Qwen3-1.7B was not listed by /v1/models")
+    sys.exit(1)
+
+print("PASS: Qwen/Qwen3-1.7B is listed by /v1/models")
+PY
+
+direct_response="$(curl -s -X POST --max-time 300 http://127.0.0.1:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "Qwen/Qwen3-1.7B",
+    "messages": [
+      {
+        "role": "user",
+        "content": "Reply with exactly: OK"
+      }
+    ],
+    "temperature": 0,
+    "max_tokens": 32
+  }')"
+
+export DIRECT_RESPONSE="$direct_response"
+python - <<'PY'
+import json
+import os
+import sys
+
+data = json.loads(os.environ["DIRECT_RESPONSE"])
+content = data["choices"][0]["message"]["content"]
+
+print("Direct curl response:", content)
+
+if "OK" not in content:
+    print(f"FAIL: Expected direct curl response to contain OK, got: {content!r}")
+    sys.exit(1)
+
+print("PASS: Direct curl chat completion worked")
+PY
+
+script_response="$(bash curl_script.sh \
+  --model Qwen/Qwen3-1.7B \
+  --prompt "Reply with exactly: OK" \
+  --temperature 0 \
+  --max-tokens 32)"
+
+export SCRIPT_RESPONSE="$script_response"
+python - <<'PY'
+import json
+import os
+import sys
+
+raw = os.environ["SCRIPT_RESPONSE"]
+
+try:
+    data = json.loads(raw)
+except json.JSONDecodeError:
+    print("FAIL: curl_script.sh did not return valid JSON")
+    print(raw)
+    sys.exit(1)
+
+content = data["choices"][0]["message"]["content"]
+
+print("curl_script.sh response:", content)
+
+if "OK" not in content:
+    print(f"FAIL: Expected curl_script.sh response to contain OK, got: {content!r}")
+    sys.exit(1)
+
+print("PASS: curl_script.sh chat completion worked")
+PY
+
+python - <<'PY'
+from openai import OpenAI
+import sys
+
+client = OpenAI(
+    base_url="http://localhost:8000/v1",
+    api_key="EMPTY",
+)
+
+response = client.chat.completions.create(
+    model="Qwen/Qwen3-1.7B",
+    messages=[
+        {"role": "user", "content": "Reply with exactly: OK"},
+    ],
+    temperature=0,
+    max_tokens=32,
+)
+
+content = response.choices[0].message.content or ""
+print("OpenAI Python API response:", content)
+
+if "OK" not in content:
+    print(f"FAIL: Expected OpenAI Python API response to contain OK, got: {content!r}")
+    sys.exit(1)
+
+print("PASS: OpenAI Python API chat completion worked")
+PY
 ```
+
 
 ## Troubleshooting
diff --git a/playbooks/supplemental/vllm-inference/playbook.json b/playbooks/supplemental/vllm-inference/playbook.json
index ae562754..66d42973 100644
--- a/playbooks/supplemental/vllm-inference/playbook.json
+++ b/playbooks/supplemental/vllm-inference/playbook.json
@@ -8,8 +8,7 @@
       "linux"
     ],
     "halo_box": [
-      "linux",
-      "windows"
+      "linux"
     ],
     "stx": [
       "linux"
@@ -18,6 +17,19 @@
       "linux"
     ]
   },
+  "tested_platforms": {
+    "halo": [
+      "linux"
+    ]
+  },
+  "required_platforms": {
+    "halo": [
+      "linux"
+    ],
+    "halo_box": [
+      "linux"
+    ]
+  },
   "difficulty": "beginner",
   "isNew": false,
   "isFeatured": true,