- 
                Notifications
    
You must be signed in to change notification settings  - Fork 479
 
Open
Labels
Description
🐛 Describe the bug
Brief description of the issue
1P(TP=16)1D(TP=16) and 2P(TP=16)1D(TP=16) are working normally, but 4P(TP=8)1D(TP=16) fails when I run the curl:
LB_IP=$(kubectl get svc/envoy-aibrix-system-aibrix-eg-903790dc -n envoy-gateway-system -o=jsonpath='{.status.loadBalancer.ingress[0].ip}')
ENDPOINT="${LB_IP}:80"
curl -v http://${ENDPOINT}/v1/models/
curl -v http://${ENDPOINT}/v1/chat/completions -H "routing-strategy: pd" -H "Content-Type: application/json" -d '{
    "model": "DeepSeek-R1",
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "help me write a random generator in python"}
    ],
    "temperature": 0.7,
     "max_tokens": 10
}'
output is:
{"error":{"code":null,"message":"httproutes.gateway.networking.k8s.io \"DeepSeek-R1-router\" not found","param":null,"type":"api_error"}}
my yaml: sglang-4P1D.yaml
# StormService deploying DeepSeek-R1 with sglang in prefill/decode (PD)
# disaggregation: 4 single-node TP8 prefill replicas + 1 two-node TP16 decode group.
apiVersion: orchestration.aibrix.ai/v1alpha1
kind: StormService
metadata:
  # NOTE(review): the name says "1p1d" but this manifest is a 4P1D topology —
  # consider renaming to sglang-4p1d (must stay consistent with the
  # selector.matchLabels and template labels below).
  name: sglang-1p1d
spec:
  replicas: 1
  updateStrategy:
    type: InPlaceUpdate
  stateful: true
  selector:
    matchLabels:
      app: sglang-1p1d
  template:
    metadata:
      labels:
        app: sglang-1p1d
    spec:
      roles:
        - name: prefill
          replicas: 4        # four independent prefill workers
          podGroupSize: 1    # each prefill worker is a single TP8 node
          stateful: true
          template:
            metadata:
              labels:
                model.aibrix.ai/name: DeepSeek-R1
                model.aibrix.ai/port: "30000"
                model.aibrix.ai/engine: sglang
            spec:
              affinity:
                nodeAffinity:
                  requiredDuringSchedulingIgnoredDuringExecution:
                    nodeSelectorTerms:
                      - matchExpressions:
                          - key: kubernetes.io/hostname
                            operator: In
                            values:
                              - pod1-gpu-027
                              - pod1-gpu-028
                              - pod1-gpu-029
                              - pod1-gpu-030
                              - pod1-gpu-031
                              - pod1-gpu-032
              containers:
                - name: prefill
                  image: 10.24.10.61:20405/sglang:v0.4.10-deepseek3.1-0822-my-re_mooncake
                  command: ["sh", "-c"]
                  args:
                    - |
                      python3 -m sglang.launch_server \
                        --model-path /llm/deepseek/DeepSeek-R1-0528-full \
                        --served-model-name DeepSeek-R1 \
                        --host 0.0.0.0 \
                        --port 30000 \
                        --mem-fraction-static 0.9 \
                        --disaggregation-mode prefill \
                        --disaggregation-transfer-backend=mooncake \
                        --disaggregation-ib-device mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_9 \
                        --trust-remote-code \
                        --tp-size 8
                  # FIX(review): the prefill role was launched WITHOUT any
                  # --disaggregation-* flags while the decode role used
                  # --disaggregation-mode decode with the mooncake backend, so
                  # the prefill servers came up as plain standalone instances
                  # and could never participate in PD disaggregation. The
                  # prefill/decode transfer backends must match.
                  # NOTE(review): the IB device list is copied from the decode
                  # role — confirm it matches the prefill nodes' hardware.
                  env:
                    - name: GLOO_SOCKET_IFNAME
                      value: eth0
                    - name: NCCL_SOCKET_IFNAME
                      value: eth0
                    - name: NCCL_IB_DISABLE
                      value: "0"
                    - name: NCCL_IB_GID_INDEX
                      value: "0"
                    - name: NCCL_DEBUG
                      value: "WARN"
                    - name: TORCH_CUDA_ARCH_LIST
                      value: "9.0"
                  volumeMounts:
                    - name: model-vol
                      mountPath: /llm
                    - name: shared-mem
                      mountPath: /dev/shm
                  resources:
                    limits:
                      nvidia.com/gpu: 8
                  securityContext:
                    allowPrivilegeEscalation: true
                    readOnlyRootFilesystem: false
                    runAsNonRoot: false
                    privileged: true
                    capabilities:
                      add:
                        - IPC_LOCK
              volumes:
                - name: model-vol
                  hostPath:
                    path: /llm
                    type: Directory
                - name: shared-mem
                  emptyDir:
                    medium: Memory
        - name: decode
          replicas: 1
          podGroupSize: 2    # one decode group spanning two TP8 nodes (TP16 total)
          stateful: true
          template:
            metadata:
              labels:
                model.aibrix.ai/name: DeepSeek-R1
                model.aibrix.ai/port: "30000"
                model.aibrix.ai/engine: sglang
            spec:
              affinity:
                nodeAffinity:
                  requiredDuringSchedulingIgnoredDuringExecution:
                    nodeSelectorTerms:
                      - matchExpressions:
                          - key: kubernetes.io/hostname
                            operator: In
                            values:
                              - pod1-gpu-027
                              - pod1-gpu-028
                              - pod1-gpu-029
                              - pod1-gpu-030
                              - pod1-gpu-031
                              - pod1-gpu-032
              containers:
                - name: decode
                  image: 10.24.10.61:20405/sglang:v0.4.10-deepseek3.1-0822-my-re_mooncake
                  command: ["sh", "-c"]
                  args:
                    - |
                      python3 -m sglang.launch_server \
                        --model-path /llm/deepseek/DeepSeek-R1-0528-full \
                        --served-model-name DeepSeek-R1 \
                        --host 0.0.0.0 \
                        --port 30000 \
                        --disaggregation-mode decode \
                        --disaggregation-transfer-backend=mooncake \
                        --trust-remote-code \
                        --disaggregation-ib-device mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_9 \
                        --dist-init-addr "${PODSET_NAME}-0.${STORM_SERVICE_NAME}.default.svc.cluster.local:5000" \
                        --nnodes 2 \
                        --node-rank $POD_GROUP_INDEX \
                        --tp-size 16 \
                        --mem-fraction-static 0.8
                  env:
                    - name: GLOO_SOCKET_IFNAME
                      value: eth0
                    - name: NCCL_SOCKET_IFNAME
                      value: eth0
                    - name: NCCL_IB_DISABLE
                      value: "0"
                    - name: NCCL_IB_GID_INDEX
                      value: "0"
                    - name: NCCL_DEBUG
                      value: "WARN"
                    - name: TORCH_CUDA_ARCH_LIST
                      value: "9.0"
                  volumeMounts:
                    - name: model-vol
                      mountPath: /llm
                    - name: shared-mem
                      mountPath: /dev/shm
                  resources:
                    limits:
                      nvidia.com/gpu: 8
                  securityContext:
                    allowPrivilegeEscalation: true
                    readOnlyRootFilesystem: false
                    runAsNonRoot: false
                    privileged: true
                    capabilities:
                      add:
                        - IPC_LOCK
              volumes:
                - name: model-vol
                  hostPath:
                    path: /llm
                    type: Directory
                - name: shared-mem
                  emptyDir:
                    medium: Memory
Steps to Reproduce
kubectl apply -f sglang-4P1D.yaml, then run the curl:
 
LB_IP=$(kubectl get svc/envoy-aibrix-system-aibrix-eg-903790dc -n envoy-gateway-system -o=jsonpath='{.status.loadBalancer.ingress[0].ip}')
ENDPOINT="${LB_IP}:80"
curl -v http://${ENDPOINT}/v1/models/
curl -v http://10.24.8.71:80/v1/models/
curl -v http://${ENDPOINT}/v1/chat/completions -H "routing-strategy: pd" -H "Content-Type: application/json" -d '{
    "model": "DeepSeek-R1",
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "help me write a random generator in python"}
    ],
    "temperature": 0.7,
     "max_tokens": 10
}'
Expected behavior
The request should return a normal chat-completion response instead of an HTTPRoute-not-found error.
Environment
- aibrix version is 2025/10/21 nightly
 - sglang version: 0.4.10