@@ -26,20 +26,19 @@ jobs:
2626 run: |
2727 CONTAINER_ID=$(docker run -d --rm --gpus=all \
2828 -p 30000:30000 \
29- -e HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} \
3029 ${{ env.SGLANG_IMAGE }} \
3130 python3 -m sglang.launch_server \
32- --model-path meta-llama/Llama-3.2-3B-Instruct \
31+ --model-path Qwen/Qwen2.5-0.5B-Instruct \
3332 --host 0.0.0.0 --port 30000 \
3433 --tp 2)
3534 echo "CONTAINER_ID=${CONTAINER_ID}" >> ${GITHUB_ENV}
3635 sleep 60
3736
3837 - name: Test inference
3938 run: |
40- docker exec ${CONTAINER_ID} curl -X POST http://localhost:30000/v1/completions \
39+ docker exec ${CONTAINER_ID} curl -X POST http://localhost:30000/generate \
4140 -H "Content-Type: application/json" \
42- -d '{"model": "meta-llama/Llama-3.2-3B-Instruct", "prompt": "Hello, how are you?", "max_tokens": 50}'
41+ -d '{"text": "Hello, how are you?", "sampling_params": {"temperature": 0.7, "max_new_tokens": 50}}'
4342
4443 - name: Show GPU usage
4544 if: always()
6564 run: |
6665 CONTAINER_ID=$(docker run -d --rm --gpus=all \
6766 -p 30000:30000 \
68- -e HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} \
6967 ${{ env.SGLANG_IMAGE }} \
7068 python3 -m sglang.launch_server \
7169 --model-path Qwen/Qwen2.5-0.5B-Instruct \
7573
7674 - name: Test inference
7775 run: |
78- docker exec ${CONTAINER_ID} curl -X POST http://localhost:30000/v1/completions \
76+ docker exec ${CONTAINER_ID} curl -X POST http://localhost:30000/generate \
7977 -H "Content-Type: application/json" \
80- -d '{"model": "Qwen/Qwen2.5-0.5B-Instruct", "prompt": "Hello, how are you?", "max_tokens": 50}'
78+ -d '{"text": "Hello, how are you?", "sampling_params": {"temperature": 0.7, "max_new_tokens": 50}}'
8179
8280 - name: Show GPU usage
8381 if: always()
0 commit comments