name: PR - SGLang G6 Inference

on:
  workflow_dispatch:
  push:
    paths:
      - ".github/workflows/pr-sglang-g6-inference.yaml"

env:
  SGLANG_IMAGE: "public.ecr.aws/deep-learning-containers/sglang:0.5.5-gpu-py312"
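  # The tag above pins a specific SGLang deep-learning-containers build;
  # bump it deliberately rather than tracking a moving "latest" tag.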

jobs:
  sglang-heavy-inference:
    runs-on: g6-2gpu-runner
    steps:
      - name: Checkout
        uses: actions/checkout@v5
      - name: Pull image
        run: docker pull ${{ env.SGLANG_IMAGE }}
      - name: Verify GPUs
        run: |
          echo "=== Host GPUs ==="
          nvidia-smi
          echo ""
          echo "=== Container GPU Test ==="
          docker run --rm --gpus=all ${{ env.SGLANG_IMAGE }} nvidia-smi
      - name: Start container (2 GPUs)
        run: |
          CONTAINER_ID=$(docker run -d --rm --gpus=all \
            -p 30000:30000 \
            -e HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} \
            ${{ env.SGLANG_IMAGE }} \
            python3 -m sglang.launch_server \
              --model-path meta-llama/Llama-3.2-3B-Instruct \
              --host 0.0.0.0 --port 30000 \
              --tp 2)
          echo "CONTAINER_ID=${CONTAINER_ID}" >> "${GITHUB_ENV}"
          sleep 60
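      # A hedged hardening sketch: the fixed sleep above may be too short for
      # model download + weight loading. This optional step polls until the
      # server answers; it assumes the SGLang server exposes GET /health and
      # that curl is present in the image (both hold for recent SGLang
      # releases, but verify for this tag).
      - name: Wait for server (readiness poll)
        run: |
          for i in $(seq 1 30); do
            if docker exec ${CONTAINER_ID} curl -sf -o /dev/null http://localhost:30000/health; then
              echo "Server is ready"
              exit 0
            fi
            echo "Waiting for server... (attempt ${i}/30)"
            sleep 10
          done
          echo "Server did not become ready in time" >&2
          exit 1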
      - name: Test inference
        run: |
          docker exec ${CONTAINER_ID} curl -X POST http://localhost:30000/v1/completions \
            -H "Content-Type: application/json" \
            -d '{"model": "meta-llama/Llama-3.2-3B-Instruct", "prompt": "Hello, how are you?", "max_tokens": 50}'
      - name: Show GPU usage
        if: always()
        run: docker exec ${CONTAINER_ID} nvidia-smi || true
      - name: Cleanup
        if: always()
        run: docker stop ${CONTAINER_ID} || true

  sglang-light-inference:
    runs-on: g6-1gpu-runner
    steps:
      - name: Checkout
        uses: actions/checkout@v5
      - name: Pull image
        run: docker pull ${{ env.SGLANG_IMAGE }}
      - name: Verify GPUs
        run: |
          echo "=== Host GPUs ==="
          nvidia-smi
          echo ""
          echo "=== Container GPU Test ==="
          docker run --rm --gpus=all ${{ env.SGLANG_IMAGE }} nvidia-smi
      - name: Start container (1 GPU)
        run: |
          CONTAINER_ID=$(docker run -d --rm --gpus=all \
            -p 30000:30000 \
            -e HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} \
            ${{ env.SGLANG_IMAGE }} \
            python3 -m sglang.launch_server \
              --model-path Qwen/Qwen2.5-0.5B-Instruct \
              --host 0.0.0.0 --port 30000)
          echo "CONTAINER_ID=${CONTAINER_ID}" >> "${GITHUB_ENV}"
          sleep 45
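      # Optional sketch: confirm the model registered before running inference.
      # Assumes the SGLang server exposes the OpenAI-compatible GET /v1/models
      # route and that curl is present in the image; verify for this tag.
      - name: List served models
        run: |
          docker exec ${CONTAINER_ID} curl -sf http://localhost:30000/v1/models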
      - name: Test inference
        run: |
          docker exec ${CONTAINER_ID} curl -X POST http://localhost:30000/v1/completions \
            -H "Content-Type: application/json" \
            -d '{"model": "Qwen/Qwen2.5-0.5B-Instruct", "prompt": "Hello, how are you?", "max_tokens": 50}'
      - name: Show GPU usage
        if: always()
        run: docker exec ${CONTAINER_ID} nvidia-smi || true
      - name: Cleanup
        if: always()
        run: docker stop ${CONTAINER_ID} || true