Skip to content

feat: add vllm rust frontend #1141

feat: add vllm rust frontend

feat: add vllm rust frontend #1141

Workflow file for this run

name: CI

# Run on pull requests that target main, but only when a file that can
# affect the build or the test results has changed.
on:
  pull_request:
    branches:
      - main
    paths:
      - '.github/workflows/ci.yml'
      - 'install.sh'
      - 'src/**'
      - 'tests/**'
      - 'pyproject.toml'

# Least privilege for the workflow token: this workflow executes code from
# the pull request (install script, editable install, tests), so the token
# is limited to reading the repository.
# NOTE(review): widen this if a step is added that must write to the repo,
# to checks, or to pull requests.
permissions:
  contents: read
jobs:
  # Single matrix job. Every OS/Python cell installs the package and runs the
  # unit tests; one macOS cell additionally uploads coverage and another runs
  # the installer plus an end-to-end request against a locally started server.
  test:
    name: Test on ${{ matrix.os }} / Python ${{ matrix.python-version }}
    runs-on: ${{ matrix.os }}
    strategy:
      # Let the remaining matrix cells finish even when one of them fails.
      fail-fast: false
      matrix:
        os: [ubuntu-latest, macos-15, macos-26]
        # Quoted so the versions stay strings instead of being read as floats.
        python-version: ['3.11', '3.12', '3.13']
    steps:
      - name: Free Disk Space (Ubuntu)
        if: runner.os == 'Linux'
        uses: jlumbroso/free-disk-space@v1.3.1
        with:
          tool-cache: false
          android: true
          dotnet: true
          haskell: true
          large-packages: true
          docker-images: true
          swap-storage: false

      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Cache pip dependencies
        uses: actions/cache@v4
        with:
          # pip's default cache directory differs per OS: ~/.cache/pip on
          # Linux and ~/Library/Caches/pip on macOS. Both are listed so the
          # cache is effective on every runner in the matrix.
          path: |
            ~/.cache/pip
            ~/Library/Caches/pip
          key: ${{ matrix.os }}-pip-${{ matrix.python-version }}-${{ hashFiles('pyproject.toml') }}
          # Fall back to progressively less specific caches on a key miss.
          restore-keys: |
            ${{ matrix.os }}-pip-${{ matrix.python-version }}-
            ${{ matrix.os }}-pip-

      - name: Install dependencies
        # The script uses the bash-only [[ ]] test, so request bash explicitly
        # (consistent with the other script steps in this job).
        shell: bash
        run: |
          python -m pip install --upgrade pip
          # Pick the extras group from the runner's operating system.
          # RUNNER_OS is a default environment variable set by the runner.
          if [[ "$RUNNER_OS" == "Linux" ]]; then
            pip install -e ".[gpu, dev]"
          else
            pip install -e ".[mac, dev]"
          fi

      - name: Run Unit Tests
        shell: bash
        run: |
          pytest tests/ -v --cov=src/parallax --cov-report=xml

      - name: Upload coverage to Codecov
        # Upload from a single matrix cell only.
        if: matrix.os == 'macos-15' && matrix.python-version == '3.11'
        uses: codecov/codecov-action@v4
        with:
          file: ./coverage.xml
          # A failed upload must not fail the CI run.
          fail_ci_if_error: false
          token: ${{ secrets.CODECOV_TOKEN }}

      - name: Install Parallax and vLLM Rust frontend
        if: matrix.os == 'macos-26' && matrix.python-version == '3.12'
        shell: bash
        run: |
          ./install.sh --extras mac --python "${{ matrix.python-version }}"
          # Fail the step unless an executable vllm-rs exists in the venv
          # (presumably created by install.sh; verify against the script).
          test -x .venv/bin/vllm-rs

      - name: Run E2E tests (macOS only)
        if: matrix.os == 'macos-26' && matrix.python-version == '3.12'
        shell: bash
        env:
          TERM: xterm-256color
        run: |
          # Start the server in the background.
          .venv/bin/python src/parallax/launch.py \
            --model-path Qwen/Qwen3-0.6B \
            --max-num-tokens-per-batch 16384 \
            --kv-block-size 32 \
            --max-batch-size 128 \
            --start-layer 0 \
            --end-layer 28 &
          PID=$!

          # "shell: bash" runs this script with "set -eo pipefail", so any
          # failing command ends the script immediately. The EXIT trap makes
          # sure the background server is stopped on every exit path.
          trap 'kill "$PID" 2>/dev/null || true' EXIT

          echo "Waiting for server to start..."
          # Poll the endpoint: 60 attempts with a 2 second pause between them.
          for i in {1..60}; do
            # The server counts as up once it answers with 200, 400 or 405;
            # the probe is a body-less GET, so an error status still proves
            # that the HTTP endpoint is listening. curl exits non-zero while
            # nothing is listening; "|| true" keeps "set -e" from aborting.
            status=$(curl -s --max-time 5 --connect-timeout 2 -o /dev/null \
              -w "%{http_code}" http://localhost:3000/v1/chat/completions || true)
            case "$status" in
              200|400|405)
                echo "Server is up!"
                break
                ;;
            esac

            # Give up early if the server process has already exited.
            if ! kill -0 "$PID" 2>/dev/null; then
              echo "Server process died prematurely"
              exit 1
            fi

            if [ "$i" -eq 60 ]; then
              echo "Server failed to start within 120 seconds"
              exit 1
            fi
            sleep 2
          done

          # Wait additional time to ensure the server is fully ready
          # (model loaded, KV cache allocated, etc.).
          echo "Waiting for server to be fully ready..."
          sleep 10

          echo "Sending test request..."
          # Capture the response; allow up to 5 minutes for model inference.
          # --fail makes curl exit non-zero on an HTTP error status, which
          # fails the step.
          RESPONSE=$(curl --fail --silent --show-error --location \
            --max-time 300 \
            --connect-timeout 10 \
            'http://localhost:3000/v1/chat/completions' \
            --header 'Content-Type: application/json' \
            --data '{
              "model": "Qwen/Qwen3-0.6B",
              "messages": [
                {
                  "role": "user",
                  "content": "What is the capital of France"
                }
              ],
              "stream": false,
              "max_tokens": 1024,
              "chat_template_kwargs": {"enable_thinking": false},
              "top_k": 3
            }')

          echo "Response received:"
          echo "$RESPONSE"

          # Check that the response contains "Paris" (case-insensitive).
          # A here-string is used instead of "echo | grep -q": under pipefail
          # a pipeline can report failure when grep -q exits before the
          # writer has finished.
          if grep -iq "Paris" <<<"$RESPONSE"; then
            echo "Test passed: Response contains 'Paris'"
          else
            echo "Test failed: Response does not contain 'Paris'"
            exit 1
          fi
          # The server process is stopped by the EXIT trap set above.