feat: add vllm rust frontend #1141

Workflow file for this run

	# CI workflow: runs the unit tests on every OS/Python combination, and on one
	# macOS configuration also runs the installer and an end-to-end inference test.
	name: CI

	# Triggered by pull requests against main that touch the workflow file, the
	# installer, the package sources, the tests, or pyproject.toml.
	on:
	  pull_request:
	    branches: [main]
	    paths:
	      - '.github/workflows/ci.yml'
	      - 'install.sh'
	      - 'src/**'
	      - 'tests/**'
	      - 'pyproject.toml'

	jobs:
	  test:
	    name: Test on ${{ matrix.os }} / Python ${{ matrix.python-version }}
	    runs-on: ${{ matrix.os }}
	    strategy:
	      # Keep the remaining matrix cells running when one cell fails.
	      fail-fast: false
	      matrix:
	        os: [ubuntu-latest, macos-15, macos-26]
	        # Quoted so the versions are strings, not floats.
	        python-version: ['3.11', '3.12', '3.13']

	    steps:
	      # Linux only: free runner disk space before installing dependencies.
	      - name: Free Disk Space (Ubuntu)
	        if: runner.os == 'Linux'
	        uses: jlumbroso/free-disk-space@v1.3.1
	        with:
	          tool-cache: false
	          android: true
	          dotnet: true
	          haskell: true
	          large-packages: true
	          docker-images: true
	          swap-storage: false

	      - name: Checkout code
	        uses: actions/checkout@v4

	      - name: Set up Python ${{ matrix.python-version }}
	        uses: actions/setup-python@v5
	        with:
	          python-version: ${{ matrix.python-version }}

	      # Cache pip downloads per OS and Python version, keyed on pyproject.toml.
	      - name: Cache pip dependencies
	        uses: actions/cache@v4
	        with:
	          # pip's cache lives in ~/.cache/pip on Linux and in
	          # ~/Library/Caches/pip on macOS; list both so every matrix OS
	          # actually caches something.
	          path: |
	            ~/.cache/pip
	            ~/Library/Caches/pip
	          key: ${{ matrix.os }}-pip-${{ matrix.python-version }}-${{ hashFiles('pyproject.toml') }}
	          # Fall back to the newest cache for the same OS/Python, then same OS.
	          restore-keys: |
	            ${{ matrix.os }}-pip-${{ matrix.python-version }}-
	            ${{ matrix.os }}-pip-

	      - name: Install dependencies
	        run: |
	          python -m pip install --upgrade pip
	          # Choose the extras set from the runner OS: gpu on Linux, mac otherwise.
	          if [[ "${{ runner.os }}" == "Linux" ]]; then
	            pip install -e ".[gpu, dev]"
	          else
	            pip install -e ".[mac, dev]"
	          fi

	      - name: Run Unit Tests
	        shell: bash
	        run: |
	          pytest tests/ -v --cov=src/parallax --cov-report=xml

	      # Upload the coverage report from a single matrix cell only.
	      - name: Upload coverage to Codecov
	        if: matrix.os == 'macos-15' && matrix.python-version == '3.11'
	        uses: codecov/codecov-action@v4
	        with:
	          file: ./coverage.xml
	          fail_ci_if_error: false
	          token: ${{ secrets.CODECOV_TOKEN }}

	      # Run the installer in one macOS cell and check that it produced an
	      # executable .venv/bin/vllm-rs.
	      - name: Install Parallax and vLLM Rust frontend
	        if: matrix.os == 'macos-26' && matrix.python-version == '3.12'
	        shell: bash
	        run: |
	          ./install.sh --extras mac --python "${{ matrix.python-version }}"
	          test -x .venv/bin/vllm-rs

	      # End-to-end check in the same macOS cell: start the server, wait for the
	      # HTTP port, send one chat completion and require "Paris" in the reply.
	      - name: Run E2E tests (macOS only)
	        if: matrix.os == 'macos-26' && matrix.python-version == '3.12'
	        shell: bash
	        env:
	          TERM: xterm-256color
	        run: |
	          # Start the server in the background
	          .venv/bin/python src/parallax/launch.py \
	            --model-path Qwen/Qwen3-0.6B \
	            --max-num-tokens-per-batch 16384 \
	            --kv-block-size 32 \
	            --max-batch-size 128 \
	            --start-layer 0 \
	            --end-layer 28 &
	          PID=$!

	          # Stop the server however the script ends. With `shell: bash` the
	          # script runs with errexit, so a failing command (e.g. curl --fail)
	          # would otherwise exit before any explicit kill is reached.
	          trap 'kill "$PID" 2>/dev/null || true' EXIT

	          echo "Waiting for server to start..."
	          # Poll the port: 60 attempts with a 2 second pause between them
	          # (each attempt may itself take up to the 5 second curl timeout).
	          for i in {1..60}; do
	            # Any of 200, 400 or 405 means the HTTP server is answering.
	            if curl -s --max-time 5 --connect-timeout 2 -o /dev/null \
	                 -w "%{http_code}" http://localhost:3000/v1/chat/completions \
	               | grep -qE "200|400|405"; then
	              echo "Server is up!"
	              break
	            fi

	            # Check if the process is still alive
	            if ! kill -0 "$PID" 2>/dev/null; then
	              echo "Server process died prematurely"
	              exit 1
	            fi

	            if [ "$i" -eq 60 ]; then
	              echo "Server failed to start within 120 seconds"
	              exit 1
	            fi
	            sleep 2
	          done

	          # Wait additional time to ensure server is fully ready (model loaded, KV cache allocated, etc.)
	          echo "Waiting for server to be fully ready..."
	          sleep 10

	          echo "Sending test request..."
	          # Capture the response with increased timeout (5 minutes for model inference)
	          RESPONSE=$(curl --fail --silent --show-error --location \
	            --max-time 300 \
	            --connect-timeout 10 \
	            'http://localhost:3000/v1/chat/completions' \
	            --header 'Content-Type: application/json' \
	            --data '{
	              "model": "Qwen/Qwen3-0.6B",
	              "messages": [
	                {
	                  "role": "user",
	                  "content": "What is the capital of France"
	                }
	              ],
	              "stream": false,
	              "max_tokens": 1024,
	              "chat_template_kwargs": {"enable_thinking": false},
	              "top_k": 3
	            }')

	          echo "Response received:"
	          echo "$RESPONSE"

	          # Check if the response contains "Paris" (case-insensitive)
	          if echo "$RESPONSE" | grep -iq "Paris"; then
	            echo "Test passed: Response contains 'Paris'"
	          else
	            echo "Test failed: Response does not contain 'Paris'"
	            exit 1
	          fi
	          # The EXIT trap above stops the server on both success and failure.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

feat: add vllm rust frontend #1141

Workflow file

feat: add vllm rust frontend #1141

Uh oh!

Workflow file for this run