Skip to content

Commit 5963527

Browse files
committed
fix v2 telemetry
Signed-off-by: Junpu Fan <junpu@amazon.com>
1 parent: c8477d6 — commit: 5963527

File tree

6 files changed: +175 additions, -8 deletions (lines changed)

.github/workflows/pr-sglang.yml

Lines changed: 38 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,8 @@ permissions:
1313

1414
env:
1515
# CI Image configuration
16+
CONTAINER_TYPE: "general"
17+
FRAMEWORK: "sglang"
1618
SGLANG_VERSION: "0.5.5"
1719
PYTHON_VERSION: "py312"
1820
CUDA_VERSION: "cu129"
@@ -116,6 +118,9 @@ jobs:
116118
docker buildx build --progress plain \
117119
--build-arg CACHE_REFRESH="$(date +"%Y-%m-%d")" \
118120
--build-arg BASE_IMAGE="lmsysorg/sglang:v${{ env.SGLANG_VERSION }}-${{ env.CUDA_VERSION }}-amd64" \
121+
--build-arg CONTAINER_TYPE="${{ env.CONTAINER_TYPE }}" \
122+
--build-arg FRAMEWORK="${{ env.FRAMEWORK }}" \
123+
--build-arg FRAMEWORK_VERSION="${{ env.SGLANG_VERSION }}" \
119124
--cache-to=type=inline \
120125
--cache-from=type=registry,ref=${CI_IMAGE_URI} \
121126
--tag ${CI_IMAGE_URI} \
@@ -158,6 +163,39 @@ jobs:
158163
echo "AWS_ACCOUNT_ID=${AWS_ACCOUNT_ID}" >> ${GITHUB_OUTPUT}
159164
echo "IMAGE_URI=${IMAGE_URI}" >> ${GITHUB_OUTPUT}
160165
166+
sglang-telemetry-test:
167+
needs: [set-test-environment, build-sglang-image]
168+
if: success()
169+
runs-on:
170+
- codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
171+
buildspec-override:true
172+
concurrency:
173+
group: ${{ github.workflow }}-sglang-telemetry-test-${{ github.event.pull_request.number }}
174+
cancel-in-progress: true
175+
steps:
176+
- name: Checkout DLC source
177+
uses: actions/checkout@v5
178+
179+
- name: Container pull
180+
uses: ./.github/actions/ecr-authenticate
181+
with:
182+
aws-account-id: ${{ needs.set-test-environment.outputs.aws-account-id }}
183+
aws-region: ${{ vars.AWS_REGION }}
184+
image-uri: ${{ needs.set-test-environment.outputs.image-uri }}
185+
186+
- name: Start container
187+
run: |
188+
CONTAINER_ID=$(docker run -d -it --rm --entrypoint /bin/bash \
189+
-e TEST_MODE=1 \
190+
${{ needs.set-test-environment.outputs.image-uri }})
191+
echo "CONTAINER_ID=${CONTAINER_ID}" >> ${GITHUB_ENV}
192+
docker logs ${CONTAINER_ID}
193+
194+
- name: Run tests
195+
run: |
196+
docker exec ${CONTAINER_ID} cat /tmp/test_request.txt | grep "sglang"
197+
198+
161199
sglang-local-benchmark-test:
162200
needs: [set-test-environment, build-sglang-image]
163201
if: success()

.github/workflows/pr-vllm.yml

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ permissions:
1313

1414
env:
1515
# CI Image configuration
16+
CONTAINER_TYPE: "general"
17+
FRAMEWORK: "vllm"
1618
VLLM_VERSION: 0.11.2
1719
VLLM_RAYSERVE_VERSION: 0.10.2
1820
PYTHON_VERSION: "py312"
@@ -118,6 +120,9 @@ jobs:
118120
docker buildx build --progress plain \
119121
--build-arg CACHE_REFRESH="$(date +"%Y-%m-%d")" \
120122
--build-arg BASE_IMAGE="vllm/vllm-openai:v${{ env.VLLM_VERSION }}" \
123+
--build-arg CONTAINER_TYPE="${{ env.CONTAINER_TYPE }}" \
124+
--build-arg FRAMEWORK="${{ env.FRAMEWORK }}" \
125+
--build-arg FRAMEWORK_VERSION="${{ env.VLLM_VERSION }}" \
121126
--cache-to=type=inline \
122127
--cache-from=type=registry,ref=${CI_IMAGE_URI} \
123128
--tag ${CI_IMAGE_URI} \
@@ -160,6 +165,38 @@ jobs:
160165
echo "AWS_ACCOUNT_ID=${AWS_ACCOUNT_ID}" >> ${GITHUB_OUTPUT}
161166
echo "IMAGE_URI=${IMAGE_URI}" >> ${GITHUB_OUTPUT}
162167
168+
vllm-ec2-telemetry-test:
169+
needs: [build-vllm-ec2-image, set-ec2-test-environment]
170+
if: success()
171+
runs-on:
172+
- codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
173+
buildspec-override:true
174+
concurrency:
175+
group: ${{ github.workflow }}-vllm-ec2-telemetry-test-${{ github.event.pull_request.number }}
176+
cancel-in-progress: true
177+
steps:
178+
- name: Checkout DLC source
179+
uses: actions/checkout@v5
180+
181+
- name: Container pull
182+
uses: ./.github/actions/ecr-authenticate
183+
with:
184+
aws-account-id: ${{ needs.set-ec2-test-environment.outputs.aws-account-id }}
185+
aws-region: ${{ vars.AWS_REGION }}
186+
image-uri: ${{ needs.set-ec2-test-environment.outputs.image-uri }}
187+
188+
- name: Start container
189+
run: |
190+
CONTAINER_ID=$(docker run -d -it --rm --entrypoint /bin/bash \
191+
-e TEST_MODE=1 \
192+
${{ needs.set-ec2-test-environment.outputs.image-uri }})
193+
echo "CONTAINER_ID=${CONTAINER_ID}" >> ${GITHUB_ENV}
194+
docker logs ${CONTAINER_ID}
195+
196+
- name: Run tests
197+
run: |
198+
docker exec ${CONTAINER_ID} cat /tmp/test_request.txt | grep "${{ env.FRAMEWORK }}"
199+
163200
vllm-ec2-regression-test:
164201
needs: [build-vllm-ec2-image, set-ec2-test-environment]
165202
if: success()
@@ -402,6 +439,9 @@ jobs:
402439
docker buildx build --progress plain \
403440
--build-arg CACHE_REFRESH="$(date +"%Y-%m-%d")" \
404441
--build-arg BASE_IMAGE="vllm/vllm-openai:v${{ env.VLLM_RAYSERVE_VERSION }}" \
442+
--build-arg CONTAINER_TYPE="${{ env.CONTAINER_TYPE }}" \
443+
--build-arg FRAMEWORK="${{ env.FRAMEWORK }}" \
444+
--build-arg FRAMEWORK_VERSION="${{ env.VLLM_RAYSERVE_VERSION }}" \
405445
--cache-to=type=inline \
406446
--cache-from=type=registry,ref=${CI_IMAGE_URI} \
407447
--tag ${CI_IMAGE_URI} \
@@ -444,6 +484,38 @@ jobs:
444484
echo "AWS_ACCOUNT_ID=${AWS_ACCOUNT_ID}" >> ${GITHUB_OUTPUT}
445485
echo "IMAGE_URI=${IMAGE_URI}" >> ${GITHUB_OUTPUT}
446486
487+
vllm-rayserve-telemetry-test:
488+
needs: [build-vllm-rayserve-image, set-rayserve-test-environment]
489+
if: success()
490+
runs-on:
491+
- codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
492+
buildspec-override:true
493+
concurrency:
494+
group: ${{ github.workflow }}-vllm-rayserve-telemetry-test-${{ github.event.pull_request.number }}
495+
cancel-in-progress: true
496+
steps:
497+
- name: Checkout DLC source
498+
uses: actions/checkout@v5
499+
500+
- name: Container pull
501+
uses: ./.github/actions/ecr-authenticate
502+
with:
503+
aws-account-id: ${{ needs.set-rayserve-test-environment.outputs.aws-account-id }}
504+
aws-region: ${{ vars.AWS_REGION }}
505+
image-uri: ${{ needs.set-rayserve-test-environment.outputs.image-uri }}
506+
507+
- name: Start container
508+
run: |
509+
CONTAINER_ID=$(docker run -d -it --rm --entrypoint /bin/bash \
510+
-e TEST_MODE=1 \
511+
${{ needs.set-rayserve-test-environment.outputs.image-uri }})
512+
echo "CONTAINER_ID=${CONTAINER_ID}" >> ${GITHUB_ENV}
513+
docker logs ${CONTAINER_ID}
514+
515+
- name: Run tests
516+
run: |
517+
docker exec ${CONTAINER_ID} cat /tmp/test_request.txt | grep "${{ env.FRAMEWORK }}"
518+
447519
vllm-rayserve-regression-test:
448520
needs: [build-vllm-rayserve-image, set-rayserve-test-environment]
449521
if: success()
@@ -686,6 +758,9 @@ jobs:
686758
docker buildx build --progress plain \
687759
--build-arg CACHE_REFRESH="$(date +"%Y-%m-%d")" \
688760
--build-arg BASE_IMAGE="vllm/vllm-openai:v${{ env.VLLM_VERSION }}" \
761+
--build-arg CONTAINER_TYPE="${{ env.CONTAINER_TYPE }}" \
762+
--build-arg FRAMEWORK="${{ env.FRAMEWORK }}" \
763+
--build-arg FRAMEWORK_VERSION="${{ env.VLLM_VERSION }}" \
689764
--cache-to=type=inline \
690765
--cache-from=type=registry,ref=${CI_IMAGE_URI} \
691766
--tag ${CI_IMAGE_URI} \
@@ -728,6 +803,38 @@ jobs:
728803
echo "AWS_ACCOUNT_ID=${AWS_ACCOUNT_ID}" >> ${GITHUB_OUTPUT}
729804
echo "IMAGE_URI=${IMAGE_URI}" >> ${GITHUB_OUTPUT}
730805
806+
vllm-sagemaker-telemetry-test:
807+
needs: [build-vllm-sagemaker-image, set-sagemaker-test-environment]
808+
if: success()
809+
runs-on:
810+
- codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
811+
buildspec-override:true
812+
concurrency:
813+
group: ${{ github.workflow }}-vllm-sagemaker-telemetry-test-${{ github.event.pull_request.number }}
814+
cancel-in-progress: true
815+
steps:
816+
- name: Checkout DLC source
817+
uses: actions/checkout@v5
818+
819+
- name: Container pull
820+
uses: ./.github/actions/ecr-authenticate
821+
with:
822+
aws-account-id: ${{ needs.set-sagemaker-test-environment.outputs.aws-account-id }}
823+
aws-region: ${{ vars.AWS_REGION }}
824+
image-uri: ${{ needs.set-sagemaker-test-environment.outputs.image-uri }}
825+
826+
- name: Start container
827+
run: |
828+
CONTAINER_ID=$(docker run -d -it --rm --entrypoint /bin/bash \
829+
-e TEST_MODE=1 \
830+
${{ needs.set-sagemaker-test-environment.outputs.image-uri }})
831+
echo "CONTAINER_ID=${CONTAINER_ID}" >> ${GITHUB_ENV}
832+
docker logs ${CONTAINER_ID}
833+
834+
- name: Run tests
835+
run: |
836+
docker exec ${CONTAINER_ID} cat /tmp/test_request.txt | grep "${{ env.FRAMEWORK }}"
837+
731838
vllm-sagemaker-regression-test:
732839
needs: [build-vllm-sagemaker-image, set-sagemaker-test-environment]
733840
if: success()

docker/sglang/Dockerfile

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
# Declare the argument as default to use as input
22
# base image: https://hub.docker.com/r/lmsysorg/sglang/tags
3+
ARG FRAMEWORK=sglang
4+
ARG FRAMEWORK_VERSION=0.5.5
5+
ARG CONTAINER_TYPE=general
36
ARG BASE_IMAGE=lmsysorg/sglang:v0.5.5-cu129-amd64
47

58
# Use input argument as base image
@@ -32,13 +35,21 @@ WORKDIR /
3235
# Copy artifacts
3336
# ===============
3437
COPY ./scripts/telemetry/deep_learning_container.py /usr/local/bin/deep_learning_container.py
35-
COPY ./scripts/telemetry/bash_telemetry.sh /usr/local/bin/bash_telemetry.sh
38+
COPY ./scripts/telemetry/bash_telemetry.sh.template /tmp/bash_telemetry.sh.template
3639
COPY ./scripts/common/install_efa.sh install_efa.sh
3740
COPY ./scripts/common/start_cuda_compat.sh /usr/local/bin/start_cuda_compat.sh
3841

42+
ARG FRAMEWORK
43+
ARG FRAMEWORK_VERSION
44+
ARG CONTAINER_TYPE
3945
RUN chmod +x /usr/local/bin/deep_learning_container.py \
46+
&& sed -e "s/{{FRAMEWORK}}/${FRAMEWORK}/g" \
47+
-e "s/{{FRAMEWORK_VERSION}}/${FRAMEWORK_VERSION}/g" \
48+
-e "s/{{CONTAINER_TYPE}}/${CONTAINER_TYPE}/g" \
49+
/tmp/bash_telemetry.sh.template >/usr/local/bin/bash_telemetry.sh \
4050
&& chmod +x /usr/local/bin/bash_telemetry.sh \
41-
&& chmod +x /usr/local/bin/start_cuda_compat.sh
51+
&& chmod +x /usr/local/bin/start_cuda_compat.sh \
52+
&& rm /tmp/bash_telemetry.sh.template
4253

4354
# Install EFA and remove vulnerable nvjpeg
4455
# =========================================

docker/vllm/Dockerfile

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
# Declare the argument as default to use as input
22
# base image: https://hub.docker.com/r/vllm/vllm-openai/tags
3+
ARG FRAMEWORK=vllm
4+
ARG FRAMEWORK_VERSION=0.11.0
5+
ARG CONTAINER_TYPE=general
36
ARG BASE_IMAGE=vllm/vllm-openai:v0.11.0
47

58
# Use input argument as base image
@@ -25,11 +28,19 @@ ENV DEBIAN_FRONTEND=noninteractive \
2528
WORKDIR /
2629

2730
COPY ./scripts/telemetry/deep_learning_container.py /usr/local/bin/deep_learning_container.py
28-
COPY ./scripts/telemetry/bash_telemetry.sh /usr/local/bin/bash_telemetry.sh
31+
COPY ./scripts/telemetry/bash_telemetry.sh.template /tmp/bash_telemetry.sh.template
2932
COPY ./scripts/common/setup_oss_compliance.sh setup_oss_compliance.sh
3033

34+
ARG FRAMEWORK
35+
ARG FRAMEWORK_VERSION
36+
ARG CONTAINER_TYPE
3137
RUN chmod +x /usr/local/bin/deep_learning_container.py \
38+
&& sed -e "s/{{FRAMEWORK}}/${FRAMEWORK}/g" \
39+
-e "s/{{FRAMEWORK_VERSION}}/${FRAMEWORK_VERSION}/g" \
40+
-e "s/{{CONTAINER_TYPE}}/${CONTAINER_TYPE}/g" \
41+
/tmp/bash_telemetry.sh.template >/usr/local/bin/bash_telemetry.sh \
3242
&& chmod +x /usr/local/bin/bash_telemetry.sh \
43+
&& rm /tmp/bash_telemetry.sh.template \
3344
&& echo 'source /usr/local/bin/bash_telemetry.sh' >>/etc/bash.bashrc \
3445
&& bash setup_oss_compliance.sh ${PYTHON} && rm setup_oss_compliance.sh \
3546
# create symlink for python

scripts/telemetry/bash_telemetry.sh renamed to scripts/telemetry/bash_telemetry.sh.template

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@
33
if [ -f /usr/local/bin/deep_learning_container.py ] && [[ -z "${OPT_OUT_TRACKING}" || "${OPT_OUT_TRACKING,,}" != "true" ]]; then
44
(
55
python /usr/local/bin/deep_learning_container.py \
6-
--framework "${FRAMEWORK}" \
7-
--framework-version "${FRAMEWORK_VERSION}" \
8-
--container-type "${CONTAINER_TYPE}" \
6+
--framework "{{FRAMEWORK}}" \
7+
--framework-version "{{FRAMEWORK_VERSION}}" \
8+
--container-type "{{CONTAINER_TYPE}}" \
99
&>/dev/null &
1010
)
11-
fi
11+
fi

scripts/telemetry/deep_learning_container.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,7 @@ def parse_args():
240240
parser = argparse.ArgumentParser()
241241
parser.add_argument(
242242
"--framework",
243-
choices=["tensorflow", "mxnet", "pytorch", "base", "vllm"],
243+
choices=["tensorflow", "mxnet", "pytorch", "base", "vllm", "sglang"],
244244
help="framework of container image.",
245245
required=True,
246246
)

0 commit comments

Comments
 (0)