Skip to content

Commit 6d83f4a

Browse files
authored
Merge branch 'master' into codex/fix-ray-client-specific-server-fork-race
2 parents 07fe244 + 297fbea commit 6d83f4a

516 files changed

Lines changed: 16782 additions & 8427 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.buildkite/rllib.rayci.yml

Lines changed: 66 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,30 @@
11
group: rllib tests
22
depends_on:
33
- forge
4-
- ray-core-build(python=3.10)
4+
- ray-core-build(*)
55
- ray-dashboard-build
66
steps:
77
# builds
8-
- name: rllibbuild
8+
- name: rllibbuild-multipy
9+
label: "wanda: rllibbuild-py{{array.python}}"
910
wanda: ci/docker/rllib.build.wanda.yaml
10-
depends_on: oss-ci-base_ml-multipy(python=3.10)
11+
tags: cibase
12+
array:
13+
python:
14+
- "3.10"
15+
- "3.12"
1116
env:
12-
PYTHON: "3.10"
17+
PYTHON: "{{array.python}}"
1318
BASE_TYPE: "ml"
1419
BUILD_VARIANT: "build"
1520
RAYCI_IS_GPU_BUILD: "false"
16-
tags: cibase
21+
depends_on: oss-ci-base_ml-multipy($)
1722

1823
- name: rllibgpubuild
1924
wanda: ci/docker/rllib.build.wanda.yaml
20-
depends_on: oss-ci-base_gpu-multipy(python=3.10)
25+
depends_on: oss-ci-base_gpu-multipy(python=3.12)
2126
env:
22-
PYTHON: "3.10"
27+
PYTHON: "3.12"
2328
BASE_TYPE: "gpu"
2429
BUILD_VARIANT: "gpubuild"
2530
RAYCI_IS_GPU_BUILD: "true"
@@ -37,9 +42,9 @@ steps:
3742
--only-tags env,evaluation,models,offline,policy,utils,algorithms,callbacks,core
3843
--except-tags gpu,multi_gpu,manual
3944
--test-env RAY_USE_MULTIPROCESSING_CPU_COUNT=1
40-
--build-name rllibbuild-py3.10
41-
--python-version 3.10
42-
depends_on: rllibbuild
45+
--build-name rllibbuild-py3.12
46+
--python-version 3.12
47+
depends_on: rllibbuild-multipy(python=3.12)
4348

4449
- label: ":brain: rllib: examples"
4550
tags: rllib
@@ -52,18 +57,18 @@ steps:
5257
--only-tags examples
5358
--except-tags gpu,multi_gpu,manual,examples_use_all_core
5459
--test-env RAY_USE_MULTIPROCESSING_CPU_COUNT=1
55-
--build-name rllibbuild-py3.10
56-
--python-version 3.10
60+
--build-name rllibbuild-py3.12
61+
--python-version 3.12
5762
# Tests all examples without gpu or multi_gpu
5863
- bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
5964
--workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}"
6065
--only-tags examples_use_all_core
6166
--except-tags gpu,multi_gpu,manual
6267
--test-env RAY_USE_MULTIPROCESSING_CPU_COUNT=1
63-
--build-name rllibbuild-py3.10
64-
--python-version 3.10
68+
--build-name rllibbuild-py3.12
69+
--python-version 3.12
6570
--skip-ray-installation # reuse the same docker image as the previous run
66-
depends_on: rllibbuild
71+
depends_on: rllibbuild-multipy(python=3.12)
6772

6873
- label: ":brain: rllib: learning tests"
6974
tags: rllib
@@ -76,18 +81,18 @@ steps:
7681
--only-tags learning_tests
7782
--except-tags gpu,multi_gpu,learning_tests_use_all_core,manual
7883
--test-arg --framework=torch
79-
--build-name rllibbuild-py3.10
80-
--python-version 3.10
84+
--build-name rllibbuild-py3.12
85+
--python-version 3.12
8186
# learning tests without a gpu but use all cores
8287
- bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
8388
--workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}"
8489
--only-tags learning_tests_use_all_core
8590
--except-tags gpu,multi_gpu,manual
8691
--test-arg --framework=torch
87-
--build-name rllibbuild-py3.10
88-
--python-version 3.10
92+
--build-name rllibbuild-py3.12
93+
--python-version 3.12
8994
--skip-ray-installation # reuse the same docker image as the previous run
90-
depends_on: rllibbuild
95+
depends_on: rllibbuild-multipy(python=3.12)
9196

9297
- label: ":brain: rllib: gpu tests"
9398
tags:
@@ -104,8 +109,8 @@ steps:
104109
--except-tags multi_gpu,manual
105110
--test-env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1
106111
--test-env=RLLIB_NUM_GPUS=1
107-
--build-name rllibgpubuild-py3.10
108-
--python-version 3.10
112+
--build-name rllibgpubuild-py3.12
113+
--python-version 3.12
109114
depends_on: rllibgpubuild
110115

111116
- label: ":brain: rllib: multi-gpu tests"
@@ -123,8 +128,8 @@ steps:
123128
--gpus 4
124129
--only-tags multi_gpu
125130
--except-tags manual
126-
--build-name rllibgpubuild-py3.10
127-
--python-version 3.10
131+
--build-name rllibgpubuild-py3.12
132+
--python-version 3.12
128133
depends_on: rllibgpubuild
129134

130135
- label: ":brain: rllib: doc tests"
@@ -139,23 +144,40 @@ steps:
139144
--only-tags doctest
140145
--except-tags gpu,manual
141146
--parallelism-per-worker 2
142-
--build-name rllibbuild-py3.10
143-
--python-version 3.10
147+
--build-name rllibbuild-py3.12
148+
--python-version 3.12
144149
# doc examples
145150
- bazel run //ci/ray_ci:test_in_docker -- //doc/... rllib
146151
--except-tags gpu,post_wheel_build,timeseries_libs,doctest
147152
--parallelism-per-worker 2
148-
--build-name rllibbuild-py3.10
149-
--python-version 3.10
153+
--build-name rllibbuild-py3.12
154+
--python-version 3.12
150155
--skip-ray-installation # reuse the same docker image as the previous run
151156
# documentation test
152157
- bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
153158
--only-tags documentation
154159
--parallelism-per-worker 2
160+
--build-name rllibbuild-py3.12
161+
--python-version 3.12
162+
--skip-ray-installation # reuse the same docker image as the previous run
163+
depends_on: rllibbuild-multipy(python=3.12)
164+
165+
- label: ":brain: rllib: python 3.10 component testing ({{matrix.worker_id}})"
166+
if: build.pull_request.labels includes "continuous-build" || pipeline.id == "0189e759-8c96-4302-b6b5-b4274406bf89" || pipeline.id == "018f4f1e-1b73-4906-9802-92422e3badaa"
167+
tags: rllib_directly
168+
instance_type: large
169+
commands:
170+
- bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
171+
--workers 4 --worker-id {{matrix.worker_id}} --parallelism-per-worker 3
172+
--only-tags env,evaluation,models,offline,policy,utils,algorithms,callbacks,core
173+
--except-tags gpu,multi_gpu,manual
174+
--test-env RAY_USE_MULTIPROCESSING_CPU_COUNT=1
155175
--build-name rllibbuild-py3.10
156176
--python-version 3.10
157-
--skip-ray-installation # reuse the same docker image as the previous run
158-
depends_on: rllibbuild
177+
depends_on: rllibbuild-multipy(python=3.10)
178+
matrix:
179+
setup:
180+
worker_id: ["0", "1", "2", "3"]
159181

160182
- label: ":brain: rllib: flaky component & examples tests"
161183
key: rllib_flaky_tests_02
@@ -170,27 +192,27 @@ steps:
170192
--only-tags env,evaluation,models,offline,policy,utils,algorithms,callbacks,core
171193
--except-tags learning_tests,examples,documentation,gpu,multi_gpu,manual
172194
--test-env RAY_USE_MULTIPROCESSING_CPU_COUNT=1
173-
--build-name rllibbuild-py3.10
174-
--python-version 3.10
195+
--build-name rllibbuild-py3.12
196+
--python-version 3.12
175197

176198
# flaky examples
177199
- bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib --run-flaky-tests --parallelism-per-worker 3
178200
--only-tags examples
179201
--except-tags multi_gpu,gpu,manual,examples_use_all_core
180202
--test-env RAY_USE_MULTIPROCESSING_CPU_COUNT=1
181-
--build-name rllibbuild-py3.10
182-
--python-version 3.10
203+
--build-name rllibbuild-py3.12
204+
--python-version 3.12
183205
--skip-ray-installation # reuse the same docker image as the previous run
184206

185207
# flaky examples use all core
186208
- bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib --run-flaky-tests --parallelism-per-worker 3
187209
--only-tags examples_use_all_core
188210
--except-tags gpu,multi_gpu,manual
189211
--test-env RAY_USE_MULTIPROCESSING_CPU_COUNT=1
190-
--build-name rllibbuild-py3.10
191-
--python-version 3.10
212+
--build-name rllibbuild-py3.12
213+
--python-version 3.12
192214
--skip-ray-installation # reuse the same docker image as the previous run
193-
depends_on: rllibbuild
215+
depends_on: rllibbuild-multipy(python=3.12)
194216
soft_fail: true
195217

196218
- label: ":brain: rllib: flaky learning tests"
@@ -206,9 +228,9 @@ steps:
206228
--only-tags learning_tests
207229
--except-tags gpu,multi_gpu,manual
208230
--test-arg --framework=torch
209-
--build-name rllibbuild-py3.10
210-
--python-version 3.10
211-
depends_on: rllibbuild
231+
--build-name rllibbuild-py3.12
232+
--python-version 3.12
233+
depends_on: rllibbuild-multipy(python=3.12)
212234
soft_fail: true
213235

214236
- label: ":brain: rllib: flaky gpu tests"
@@ -225,8 +247,8 @@ steps:
225247
--except-tags multi_gpu,manual
226248
--test-env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1
227249
--test-env=RLLIB_NUM_GPUS=1
228-
--build-name rllibgpubuild-py3.10
229-
--python-version 3.10
250+
--build-name rllibgpubuild-py3.12
251+
--python-version 3.12
230252
depends_on: rllibgpubuild
231253
soft_fail: true
232254

@@ -244,7 +266,7 @@ steps:
244266
--gpus 4
245267
--only-tags multi_gpu
246268
--except-tags manual
247-
--build-name rllibgpubuild-py3.10
248-
--python-version 3.10
269+
--build-name rllibgpubuild-py3.12
270+
--python-version 3.12
249271
depends_on: rllibgpubuild
250272
soft_fail: true

.buildkite/serve.rayci.yml

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,4 +303,21 @@ steps:
303303
commands:
304304
- bazel run //ci/ray_ci:test_in_docker -- //... serve --run-flaky-tests --parallelism-per-worker 3
305305
--python-version 3.10 --build-name servebuild-py3.10
306+
--except-tags gpu
306307
depends_on: servebuild-multipy(python=3.10)
308+
309+
- label: ":ray-serve: serve: flaky gpu tests"
310+
key: serve_flaky_gpu_tests
311+
tags:
312+
- serve
313+
- python
314+
- gpu
315+
- flaky
316+
- skip-on-premerge
317+
instance_type: gpu
318+
soft_fail: true
319+
commands:
320+
- bazel run //ci/ray_ci:test_in_docker -- //... serve --run-flaky-tests
321+
--python-version 3.10 --build-name docgpubuild-py3.10
322+
--only-tags gpu
323+
depends_on: docgpubuild

.github/CODEOWNERS

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,3 +131,9 @@
131131
/.github/workflows/ @ray-project/ray-ci
132132

133133
/.gemini/ @edoakes @ray-project/ray-ci
134+
135+
# TPU
136+
/python/ray/util/tpu.py @andrewsykim @edoakes @ryanaoleary
137+
/python/ray/tests/test_tpu.py @andrewsykim @edoakes @ryanaoleary
138+
/python/ray/tests/accelerators/test_tpu.py @andrewsykim @edoakes @ryanaoleary
139+
/python/ray/_private/accelerators/tpu.py @andrewsykim @edoakes @ryanaoleary

README.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ Install Ray with: ``pip install ray``. For nightly wheels, see the
4848
`Installation page <https://docs.ray.io/en/latest/ray-overview/installation.html>`__.
4949

5050
.. _`Serve`: https://docs.ray.io/en/latest/serve/index.html
51-
.. _`Data`: https://docs.ray.io/en/latest/data/dataset.html
51+
.. _`Data`: https://docs.ray.io/en/latest/data/data.html
5252
.. _`Workflow`: https://docs.ray.io/en/latest/workflows/
5353
.. _`Train`: https://docs.ray.io/en/latest/train/train.html
5454
.. _`Tune`: https://docs.ray.io/en/latest/tune/index.html

ci/ci.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -214,8 +214,9 @@ build_dashboard_front_end() {
214214
if [[ -z "${BUILDKITE-}" || "${OSTYPE}" != linux* ]]; then
215215
if [[ -d "${HOME}/.nvm" ]]; then
216216
set +x # suppress set -x since it'll get very noisy here
217+
export TMPDIR="${TMPDIR:-/tmp}"
217218
. "${HOME}/.nvm/nvm.sh"
218-
NODE_VERSION="14"
219+
NODE_VERSION="20"
219220
nvm install $NODE_VERSION
220221
nvm use --silent $NODE_VERSION
221222
fi

ci/docker/llm.build.Dockerfile

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,42 @@
11
# syntax=docker/dockerfile:1.3-labs
22

33
ARG DOCKER_IMAGE_BASE_BUILD=cr.ray.io/rayproject/oss-ci-base_build-py3.11
4+
FROM $DOCKER_IMAGE_BASE_BUILD AS haproxy-builder
5+
6+
RUN <<EOF
7+
#!/bin/bash
8+
set -euo pipefail
9+
10+
apt-get update -y
11+
apt-get install -y --no-install-recommends \
12+
build-essential \
13+
ca-certificates \
14+
curl \
15+
libc6-dev \
16+
liblua5.3-dev \
17+
libpcre3-dev \
18+
libssl-dev \
19+
zlib1g-dev
20+
21+
# Install HAProxy from source.
22+
# Fetched from ray-project/haproxy-release (a GitHub release mirror) because
23+
# www.haproxy.org's wildcard TLS cert expired 2026-04-17 and the release tarball
24+
# disappeared from the upstream download site. Integrity is enforced by sha256
25+
# verification. Drop this mirror and switch back to www.haproxy.org once the
26+
# cert is renewed and the tarball is republished.
27+
HAPROXY_VERSION="2.8.20"
28+
HAPROXY_SHA256="c8301de11dabfbf049db07080e43b9570a63f99e41d4b0754760656bf7ea00b7"
29+
HAPROXY_BUILD_DIR=$(mktemp -d)
30+
curl --retry 5 --retry-all-errors --connect-timeout 20 --max-time 300 \
31+
-sSfL -o "${HAPROXY_BUILD_DIR}/haproxy.tar.gz" \
32+
"https://github.com/ray-project/haproxy-release/releases/download/${HAPROXY_VERSION}/haproxy-${HAPROXY_VERSION}.tar.gz"
33+
echo "${HAPROXY_SHA256} ${HAPROXY_BUILD_DIR}/haproxy.tar.gz" | sha256sum -c -
34+
tar -xzf "${HAPROXY_BUILD_DIR}/haproxy.tar.gz" -C "${HAPROXY_BUILD_DIR}" --strip-components=1
35+
make -C "${HAPROXY_BUILD_DIR}" TARGET=linux-glibc USE_OPENSSL=1 USE_ZLIB=1 USE_PCRE=1 USE_LUA=1 USE_PROMEX=1 -j$(nproc)
36+
make -C "${HAPROXY_BUILD_DIR}" install
37+
rm -rf "${HAPROXY_BUILD_DIR}"
38+
EOF
39+
440
FROM $DOCKER_IMAGE_BASE_BUILD
541

642
ARG RAY_CI_JAVA_BUILD=
@@ -10,6 +46,17 @@ SHELL ["/bin/bash", "-ice"]
1046

1147
COPY . .
1248

49+
COPY --from=haproxy-builder /usr/local/sbin/haproxy /usr/local/sbin/haproxy
50+
51+
RUN <<EOF
52+
#!/bin/bash
53+
set -euo pipefail
54+
55+
apt-get update -y && apt-get install -y --no-install-recommends liblua5.3-0 libpcre3
56+
rm -rf /var/lib/apt/lists/*
57+
mkdir -p /etc/haproxy /run/haproxy /var/log/haproxy
58+
EOF
59+
1360
RUN <<EOF
1461
#!/bin/bash
1562

ci/env/install-dependencies.sh

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,8 +163,15 @@ install_node() {
163163

164164
(
165165
set +x # suppress set -x since it'll get very noisy here.
166+
# nvm's error-recovery path in nvm.sh references $TMPDIR. The caller
167+
# runs with `set -u`, so an unset TMPDIR (launchd-spawned shells don't
168+
# inherit one) crashes nvm with "TMPDIR: unbound variable" whenever a
169+
# binary download fails and nvm falls back to source.
170+
export TMPDIR="${TMPDIR:-/tmp}"
166171
. "${HOME}/.nvm/nvm.sh"
167-
NODE_VERSION="14"
172+
# Node 14 EOL'd April 2023; nodejs.org removed the darwin-arm64
173+
# prebuilt and the URL now 404s. Node 20 is the current LTS line.
174+
NODE_VERSION="20"
168175
nvm install $NODE_VERSION
169176
nvm use --silent $NODE_VERSION
170177
npm config set loglevel warn # make NPM quieter

0 commit comments

Comments
 (0)