Skip to content

Commit 8ea932b

Browse files
committed
try build
Signed-off-by: Junpu Fan <junpu@amazon.com>
1 parent 66c6091 commit 8ea932b

File tree

10 files changed

+650
-13
lines changed

10 files changed

+650
-13
lines changed

.github/workflows/pr-example.yml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,17 @@ jobs:
4141
- uses: actions/checkout@v5
4242
- run: .github/scripts/runner_setup.sh
4343
- run: .github/scripts/buildkitd.sh
44+
- name: build vllm-rayserve-ec2 image
45+
shell: bash
46+
run: |
47+
DATE=$(date +"%Y-%m-%d")
48+
COMMIT_REF=$(git rev-parse --short HEAD)
49+
DOCKER_BUILDKIT=1 docker build --progress plain \
50+
--build-arg CACHE_REFRESH=${DATE} \
51+
--tag vllm:0.10.2-gpu-py312-cu128-ubuntu22.04-rayserve-ec2-${COMMIT_REF} \
52+
--target vllm-rayserve-ec2 \
53+
-f docker/vllm/Dockerfile.rayserve .
54+
docker image ls
4455
4556
example-on-g6xl-runner-1:
4657
needs: [example-on-build-runner]

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,4 @@ __pycache__
33
.idea
44
*.pyc
55
.venv
6+
.ruff_cache

.pre-commit-config.yaml

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -15,23 +15,14 @@ repos:
1515
# optional: add additional arguments here
1616
- --indent=2
1717
- --write
18-
stages: [manual] # run in CI
19-
- repo: https://github.com/rhysd/actionlint
20-
rev: v1.7.7
21-
hooks:
22-
- id: actionlint
23-
stages: [manual] # run in CI
18+
stages: [pre-commit]
2419
- repo: https://github.com/scop/pre-commit-shfmt
2520
rev: v3.12.0-2 # Use the latest stable revision
2621
hooks:
2722
- id: shfmt
2823
# Optional: Add arguments to shfmt if needed, e.g., to enable "simplify" mode
2924
args: ["-s"]
30-
- repo: https://github.com/crate-ci/typos
31-
rev: v1.38.1
32-
hooks:
33-
- id: typos
34-
args: [--force-exclude]
25+
stages: [pre-commit]
3526
- repo: https://github.com/hukkin/mdformat
3627
rev: 1.0.0 # Use the ref you want to point at
3728
hooks:
@@ -40,17 +31,28 @@ repos:
4031
additional_dependencies:
4132
- mdformat-gfm
4233
- mdformat-black
34+
stages: [pre-commit]
4335
- repo: https://github.com/igorshubovych/markdownlint-cli
4436
rev: v0.45.0
4537
hooks:
4638
- id: markdownlint
4739
args: [--fix]
40+
stages: [pre-commit]
4841
- repo: https://github.com/astral-sh/ruff-pre-commit
4942
rev: v0.14.3
5043
hooks:
51-
- id: ruff-check
52-
args: [ --fix, --output-format=github ]
5344
- id: ruff-format
45+
stages: [pre-commit]
46+
- id: ruff-check
47+
- repo: https://github.com/rhysd/actionlint
48+
rev: v1.7.7
49+
hooks:
50+
- id: actionlint
51+
- repo: https://github.com/crate-ci/typos
52+
rev: v1.38.1
53+
hooks:
54+
- id: typos
55+
args: [--force-exclude]
5456
- repo: local
5557
hooks:
5658
- id: signoff-commit

DEVELOPMENT.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,13 @@ uv pip install pre-commit
2828
pre-commit install
2929
```
3030

31+
Install Go using [Homebrew](https://brew.sh/). The example below assumes macOS.
32+
33+
```bash
34+
brew install go
35+
go env -w GOPROXY=direct
36+
```
37+
3138
To manually run all linters:
3239

3340
```bash

docker/vllm/Dockerfile.rayserve

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
# Base stage: starts from the upstream vLLM OpenAI image and adds the image
# labels and the environment shared by every stage derived from it.
FROM docker.io/vllm/vllm-openai:v0.10.2 AS base

# Build arguments consumed by later RUN steps in this stage:
#   PYTHON      - interpreter name handed to setup_oss_compliance.sh
#   EFA_VERSION - installer version handed to install_efa.sh
ARG PYTHON="python3"
ARG EFA_VERSION="1.43.3"

LABEL maintainer="Amazon AI" \
      dlc_major_version="1"

# Locale, apt front-end and container-type marker.
ENV DEBIAN_FRONTEND=noninteractive \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8 \
    DLC_CONTAINER_TYPE=base

# PYTHONDONTWRITEBYTECODE: Python won't try to write .pyc or .pyo files on the
#   import of source modules.
# PYTHONUNBUFFERED: force stdin, stdout and stderr to be totally unbuffered,
#   which is good for logging.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PYTHONIOENCODING=UTF-8

# Prepend the OFI-NCCL, Open MPI, EFA and CUDA directories to the search paths
# inherited from the parent image.
ENV LD_LIBRARY_PATH="/usr/local/lib:/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" \
    PATH="/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/cuda/bin:${PATH}"
17+
18+
WORKDIR /

# The telemetry helpers are installed under /usr/local/bin; the OSS compliance
# script is copied relative to WORKDIR and deleted again by the RUN below.
COPY ./scripts/telemetry/deep_learning_container.py /usr/local/bin/deep_learning_container.py
COPY ./scripts/telemetry/bash_telemetry.sh /usr/local/bin/bash_telemetry.sh
COPY ./scripts/setup_oss_compliance.sh setup_oss_compliance.sh

# Make the telemetry helpers executable, source the telemetry hook from the
# system-wide bashrc, run the OSS compliance setup, and add a `python` alias.
#
# Cleanup notes:
#  - HOME_DIR is not declared (ARG/ENV) anywhere in this Dockerfile, so the
#    compliance leftovers are removed from /root explicitly, which is the
#    directory setup_oss_compliance.sh works in.
#  - Only the final cache removal is best-effort; the `|| true` is grouped so
#    it cannot mask a failure of any earlier command in the chain.
RUN chmod +x /usr/local/bin/deep_learning_container.py \
    && chmod +x /usr/local/bin/bash_telemetry.sh \
    && echo 'source /usr/local/bin/bash_telemetry.sh' >>/etc/bash.bashrc \
    && bash setup_oss_compliance.sh ${PYTHON} && rm setup_oss_compliance.sh \
    # create symlink for python
    && ln -s /usr/bin/python3 /usr/bin/python \
    # clean up
    && rm -rf /root/oss_compliance* \
    && rm -rf /tmp/tmp* \
    && rm -rf /tmp/uv* \
    && rm -rf /var/lib/apt/lists/* \
    && { rm -rf /root/.cache || true; }
36+
37+
# Run the EFA install script, then replace the CUDA toolkit's libnvjpeg
# library and headers with the 12.4.0.76 redistributable archive, and finally
# delete the cuobjdump and nvdisasm binaries.
COPY ./scripts/install_efa.sh install_efa.sh
RUN bash install_efa.sh ${EFA_VERSION} \
    && rm install_efa.sh \
    # NVJPEG_ARCHIVE is a shell variable local to this RUN step
    && NVJPEG_ARCHIVE="libnvjpeg-linux-x86_64-12.4.0.76-archive" \
    && mkdir -p /tmp/nvjpeg \
    && cd /tmp/nvjpeg \
    && wget "https://developer.download.nvidia.com/compute/cuda/redist/libnvjpeg/linux-x86_64/${NVJPEG_ARCHIVE}.tar.xz" \
    && tar -xvf "${NVJPEG_ARCHIVE}.tar.xz" \
    && rm -rf /usr/local/cuda/targets/x86_64-linux/lib/libnvjpeg* \
    && rm -rf /usr/local/cuda/targets/x86_64-linux/include/nvjpeg.h \
    && cp "${NVJPEG_ARCHIVE}"/lib/libnvjpeg* /usr/local/cuda/lib64/ \
    && cp "${NVJPEG_ARCHIVE}"/include/* /usr/local/cuda/include/ \
    && rm -rf /tmp/nvjpeg \
    # remove cuobjdump and nvdisasm
    && rm -rf /usr/local/cuda/bin/cuobjdump* \
    && rm -rf /usr/local/cuda/bin/nvdisasm*
52+
53+
# ====================== ray serve =========================================
54+
FROM base AS vllm-rayserve-ec2

# The requirement is quoted so the shell never treats `[serve]` as a glob
# pattern.
RUN uv pip install --system "ray[serve]==2.49.0" \
    && uv cache clean

# CACHE_REFRESH exists only to invalidate the build cache for the RUN below:
# passing a new value (the CI workflow passes the current date) forces the
# OS package upgrade to run again.
ARG CACHE_REFRESH=0
# Hold every installed package whose `dpkg -l` line matches cuda/nvidia/libnv
# so the upgrade leaves them alone, then upgrade the remaining packages.
# `xargs -r` skips `apt-mark hold` when nothing matches, and the apt package
# lists are removed in the same layer that downloaded them.
RUN dpkg -l | grep -E "cuda|nvidia|libnv" | awk '{print $2}' | xargs -r apt-mark hold \
    && apt-get update \
    && apt-get upgrade -y \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

COPY ./scripts/dockerd_entrypoint.sh /usr/local/bin/dockerd_entrypoint.sh
RUN chmod +x /usr/local/bin/dockerd_entrypoint.sh

ENTRYPOINT ["/usr/local/bin/dockerd_entrypoint.sh"]

scripts/dockerd_entrypoint.sh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
#!/usr/bin/env bash
# Container entrypoint: run the telemetry hook, then start the
# vllm.entrypoints.openai.api_server module with the container's arguments.

# Telemetry is best-effort: its output is discarded and any failure
# (including a missing script) is ignored so it cannot block start-up.
bash /usr/local/bin/bash_telemetry.sh >/dev/null 2>&1 || true

# exec replaces this wrapper shell with the server process, so the server
# receives container signals (e.g. SIGTERM on stop) directly.
exec python3 -m vllm.entrypoints.openai.api_server "$@"

scripts/install_efa.sh

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
#!/bin/bash
2+
3+
set -ex
4+
5+
# Derive the multiarch library directory name (ARCH_DIR) from the machine
# architecture; only x86_64 and aarch64 are supported.
ARCH=$(uname -m)
case "$ARCH" in
    x86_64 | aarch64)
        ARCH_DIR="${ARCH}-linux-gnu"
        ;;
    *)
        echo "Unsupported architecture: $ARCH"
        exit 1
        ;;
esac
18+
19+
# Verify that libnccl-net.so exists in the OFI-NCCL library directory for the
# detected architecture.
# Globals:
#   reads ARCH_DIR; sets OFI_LIB_DIR and NCCL_NET_SO
# Returns:
#   0 when the file exists, 1 (after printing an error) otherwise
function check_libnccl_net_so {
    OFI_LIB_DIR="/opt/amazon/ofi-nccl/lib/${ARCH_DIR}"
    NCCL_NET_SO="$OFI_LIB_DIR/libnccl-net.so"

    # Guard clause: nothing more to do when the library is present.
    [ -f "$NCCL_NET_SO" ] && return 0

    echo "ERROR: $NCCL_NET_SO does not exist"
    return 1
}
29+
30+
# Install the AWS EFA installer package, wrap mpirun so it can run as root,
# write Open MPI / NCCL defaults, and configure OpenSSH so containers can
# reach each other without host-key prompts.
# Arguments:
#   $1 - EFA installer version (e.g. 1.43.3)
# Globals:
#   sets EFA_VERSION and OPEN_MPI_PATH
# Returns:
#   the status of check_libnccl_net_so; any earlier failing command aborts the
#   script because it runs under `set -e`.
function install_efa {
    EFA_VERSION=$1
    OPEN_MPI_PATH="/opt/amazon/openmpi"

    # Install build time tools
    apt-get update
    apt-get install -y --allow-change-held-packages --no-install-recommends \
        curl \
        build-essential \
        cmake \
        git

    # Install EFA. The download/unpack/install runs in a subshell so the
    # caller's working directory is untouched and is never left pointing at
    # the directory that is deleted afterwards.
    mkdir -p /tmp/efa
    (
        cd /tmp/efa
        # --fail makes curl exit non-zero on an HTTP error instead of saving
        # the error page as the tarball.
        curl --fail -O "https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz"
        tar -xf "aws-efa-installer-${EFA_VERSION}.tar.gz"
        cd aws-efa-installer
        ./efa_installer.sh -y --skip-kmod --skip-limit-conf --no-verify
    )
    rm -rf /tmp/efa

    # Configure Open MPI: replace mpirun with a wrapper that always passes
    # --allow-run-as-root to the real binary.
    mv "${OPEN_MPI_PATH}/bin/mpirun" "${OPEN_MPI_PATH}/bin/mpirun.real"
    echo '#!/bin/bash' > "${OPEN_MPI_PATH}/bin/mpirun"
    echo "${OPEN_MPI_PATH}/bin/mpirun.real --allow-run-as-root \"\$@\"" >> "${OPEN_MPI_PATH}/bin/mpirun"
    chmod a+x "${OPEN_MPI_PATH}/bin/mpirun"
    echo "hwloc_base_binding_policy = none" >> "${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf"
    echo "rmaps_base_mapping_policy = slot" >> "${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf"

    # Configure NCCL parameters
    echo "NCCL_DEBUG=INFO" >> /etc/nccl.conf
    echo "NCCL_SOCKET_IFNAME=^docker0,lo" >> /etc/nccl.conf

    # Install OpenSSH for MPI to communicate between containers, allow OpenSSH
    # to talk to containers without asking for confirmation
    apt-get install -y --no-install-recommends \
        openssh-client \
        openssh-server
    mkdir -p /var/run/sshd
    # Drop any existing StrictHostKeyChecking line, then append the override.
    grep -v StrictHostKeyChecking /etc/ssh/ssh_config > /etc/ssh/ssh_config.new
    echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new
    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config

    # Configure OpenSSH so that nodes can communicate with each other
    sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
    # Start from an empty /root/.ssh, generate a passphrase-less RSA key and
    # authorize it for root.
    rm -rf /root/.ssh/
    mkdir -p /root/.ssh/
    ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa
    cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys
    printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config

    # Remove build time tools (intentionally disabled)
    # apt-get remove -y
    #     curl
    #     build-essential
    #     cmake
    #     git

    # Cleanup
    apt-get clean
    apt-get autoremove -y
    rm -rf /var/lib/apt/lists/*
    ldconfig
    check_libnccl_net_so
}
91+
92+
# Argument handling: every argument that looks like a dotted version number
# triggers an EFA install for that version; anything else aborts the script.
# The pattern accepts a major version of any number of digits (e.g. 10.0.1),
# and "$1" is quoted so the value is passed through as a single word.
while test $# -gt 0
do
    case "$1" in
        [0-9]*.[0-9]*.[0-9]*) install_efa "$1"
        ;;
        *) echo "bad argument $1"; exit 1
        ;;
    esac
    shift
done

scripts/setup_oss_compliance.sh

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#!/bin/bash
2+
3+
set -ex
4+
5+
# Download the OSS compliance bundle, install its testOSSCompliance helper,
# run generate_oss_compliance.sh, and remove the working files.
# Arguments:
#   $1 - Python interpreter to use; falls back to "python3" when empty
# Globals:
#   sets HOME_DIR and PYTHON
function install_oss_compliance {
    HOME_DIR="/root"
    PYTHON=$1

    if [ -z "$PYTHON" ]; then
        echo "Python version not specified. Using default Python."
        PYTHON="python3"
    fi

    # --fail makes curl exit non-zero on an HTTP error instead of saving the
    # error response as the zip file.
    curl --fail -o "${HOME_DIR}/oss_compliance.zip" https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip
    # Unpack with Python's zipfile module, then delete the archive.
    "${PYTHON}" -c "import zipfile, os; zipfile.ZipFile('${HOME_DIR}/oss_compliance.zip').extractall('${HOME_DIR}/'); os.remove('${HOME_DIR}/oss_compliance.zip')"
    cp "${HOME_DIR}/oss_compliance/test/testOSSCompliance" /usr/local/bin/testOSSCompliance
    chmod +x /usr/local/bin/testOSSCompliance
    chmod +x "${HOME_DIR}/oss_compliance/generate_oss_compliance.sh"
    "${HOME_DIR}/oss_compliance/generate_oss_compliance.sh" "${HOME_DIR}" "${PYTHON}"

    # Clean up the bundle and temporary files.
    rm -rf "${HOME_DIR}"/oss_compliance*
    rm -rf /tmp/tmp*
    # Remove the cache directory (the original note says this is needed for
    # security verification). Best-effort: a failure here must not abort the
    # script under `set -e`.
    rm -rf "${HOME_DIR}/.cache" || true
}
24+
25+
# Argument handling: every argument starting with "python" is treated as the
# interpreter to run the compliance setup with; anything else aborts the
# script. "$1" is quoted so the value is passed through as a single word.
while test $# -gt 0
do
    case "$1" in
        python*) install_oss_compliance "$1"
        ;;
        *) echo "bad argument $1"; exit 1
        ;;
    esac
    shift
done
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
#!/bin/bash
# telemetry.sh
#
# Launches deep_learning_container.py in the background when the script is
# present and the user has not opted out by setting OPT_OUT_TRACKING to "true"
# (compared case-insensitively; an unset or empty value counts as not opted
# out).
#
# NOTE(review): FRAMEWORK, FRAMEWORK_VERSION and CONTAINER_TYPE are read from
# the environment; confirm the image actually sets them.
if [ -f /usr/local/bin/deep_learning_container.py ] && [[ "${OPT_OUT_TRACKING,,}" != "true" ]]; then
    # The backgrounded command is started from a subshell with all of its
    # output discarded, so the calling shell neither waits for it nor prints
    # anything from it.
    (
        python /usr/local/bin/deep_learning_container.py \
            --framework "${FRAMEWORK}" \
            --framework-version "${FRAMEWORK_VERSION}" \
            --container-type "${CONTAINER_TYPE}" \
            &>/dev/null &
    )
fi

0 commit comments

Comments
 (0)