[Feature] Support unpadded whole-trajectory collector batches #11370

Workflow file for this run

.github/workflows/benchmarks_pr.yml at 3d5124f

	name: Continuous Benchmark (PR)
	on:
	pull_request:

	permissions:
	contents: read
	actions: read

	concurrency:
	# Documentation suggests ${{ github.head_ref }}, but that's only available on pull_request/pull_request_target triggers, so using ${{ github.ref }}.
	# On master, we want all builds to complete even if merging happens faster to make it easier to discover at which point something broke.
	group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && format('ci-master-{0}', github.sha) \|\| format('ci-{0}', github.ref) }}
	cancel-in-progress: true

	jobs:

	benchmark:
	name: ${{ matrix.device }} Pytest benchmark
	runs-on: linux.g5.4xlarge.nvidia.gpu
	env:
	PR_BASE_REPO: ${{ github.event.pull_request.base.repo.full_name }}
	PR_BASE_SHA: ${{ github.event.pull_request.base.sha }}
	PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }}
	strategy:
	fail-fast: false
	matrix:
	include:
	- device: CPU
	image: nvidia/cuda:12.6.3-cudnn-runtime-ubuntu22.04
	- device: GPU
	image: nvidia/cuda:12.6.3-cudnn-devel-ubuntu22.04
	defaults:
	run:
	shell: bash -l {0}
	container:
	image: ${{ matrix.image }}
	options: --gpus all --shm-size=8g
	steps:
	- name: Who triggered this?
	run: \|
	echo "Action triggered by ${{ github.event.pull_request.html_url }}"
	- name: Check ldd --version
	run: ldd --version
	- name: Install git for checkout
	run: \|
	apt-get update -y
	apt-get install -y git ca-certificates
	- name: Checkout
	uses: actions/checkout@v4
	with:
	fetch-depth: 0
	repository: ${{ github.event.pull_request.head.repo.full_name }}
	ref: ${{ github.event.pull_request.head.sha }}
	- name: Python Setup
	uses: actions/setup-python@v5
	with:
	python-version: '3.10'
	- name: Setup Environment
	run: \|
	export TZ=Europe/London
	export DEBIAN_FRONTEND=noninteractive # tzdata bug
	apt-get update -y
	apt-get install software-properties-common cmake -y
	add-apt-repository ppa:git-core/candidate -y
	apt-get update -y
	apt-get upgrade -y
	apt-get -y install libglu1-mesa libgl1-mesa-glx libosmesa6 gcc curl g++ unzip wget libglfw3-dev libgles2-mesa-dev libglew-dev sudo git cmake libz-dev libpython3.10-dev
	- name: Setup git
	run: git config --global --add safe.directory /__w/rl/rl
	- name: setup Path
	run: \|
	echo /usr/local/bin >> "$GITHUB_PATH"
	- name: Setup benchmarks
	run: \|
	BENCHMARK_SITE_DIR="$(mktemp -d)"
	{
	echo "BASE_SHA=${PR_BASE_SHA:0:8}"
	echo "HEAD_SHA=${PR_HEAD_SHA:0:8}"
	echo "BASELINE_JSON=$(mktemp)"
	echo "CONTENDER_JSON=$(mktemp)"
	echo "BENCHMARK_SITE_DIR=${BENCHMARK_SITE_DIR}"
	} >> "$GITHUB_ENV"
	cat > "${BENCHMARK_SITE_DIR}/sitecustomize.py" <<'PY'
	import warnings

	try:
	import torch

	torch._dynamo.config.reorderable_logging_functions.add(warnings.warn)
	except (AttributeError, ImportError):
	pass
	PY
	- name: Install benchmark dependencies
	run: \|
	set -euxo pipefail
	python3.10 -m venv --system-site-packages ./py310
	source ./py310/bin/activate
	export PYTHON_INCLUDE_DIR=/usr/include/python3.10

	# NB: the nightly/cu128 channel is frozen (torch and torchvision builds
	# drifted out of sync there, making install ResolutionImpossible). Use the
	# live cu126 nightly channel; its CUDA 12.6 wheels run fine on the GPU
	# runner via driver backward-compatibility.
	# The --extra-index-url onto PyPI is required: torch nightly pulls in
	# transitive deps (e.g. spmd-types) that are only shipped as sdists on the
	# torch channel, and building those sdists needs setuptools/wheel which the
	# torch index does not host. torch/torchvision still resolve from nightly
	# (their dev versions outrank any PyPI stable), and assert_torch_version.sh
	# below fails the job loudly if that ever stops holding.
	python3.10 -m pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu126 --extra-index-url https://pypi.org/simple -U
	python3.10 -m pip install ninja pytest pytest-benchmark pytest-timeout "hoptorch>=0.1.4" "mujoco>=3.8.1,<3.9.0" "dm_control>=1.0.41" "gym[accept-rom-license,atari]" transformers accelerate ray
	python3.10 -m pip install "pybind11[global]"
	python3.10 -m pip install cloudpickle packaging importlib_metadata numpy orjson "pyvers>=0.2.0,<0.3.0"
	python3.10 -m pip install --no-deps git+https://github.com/pytorch/tensordict
	python3.10 -m pip install safetensors tqdm pandas numpy matplotlib

	bash .github/unittest/helpers/assert_torch_version.sh nightly
	bash .github/unittest/helpers/assert_torch_tensordict_versions.sh nightly
	- name: Run baseline benchmarks
	run: \|
	set -euxo pipefail
	source ./py310/bin/activate
	export PYTHON_INCLUDE_DIR=/usr/include/python3.10
	export TORCHRL_BENCHMARK_DEVICE="${{ matrix.device }}"
	if [ "${TORCHRL_BENCHMARK_DEVICE}" = "CPU" ]; then
	export CUDA_VISIBLE_DEVICES=
	fi
	git fetch --no-tags --depth=1 "https://github.com/${PR_BASE_REPO}.git" "${PR_BASE_SHA}"
	git checkout --detach "${PR_BASE_SHA}"
	rm -rf build
	python3.10 -m pip install -e . --no-build-isolation --no-deps

	if [ "${{ matrix.device }}" = "GPU" ]; then
	# test import and fail early if the GPU runner did not expose CUDA
	nvcc --version
	python -c "import torch; assert torch.cuda.device_count()"
	python -c "import torchrl._torchrl as ext; assert hasattr(ext, 'CudaSumSegmentTreeFp32')"
	fi

	REPO_ROOT="$(pwd)"
	cd "${REPO_ROOT}/benchmarks"
	export TORCHDYNAMO_INLINE_INBUILT_NN_MODULES=1
	export COMPOSITE_LP_AGGREGATE=0
	export TD_GET_DEFAULTS_TO_NONE=1
	export PYTHONPATH="${BENCHMARK_SITE_DIR}${PYTHONPATH:+:${PYTHONPATH}}"
	python -m pytest -vvv --rank 0 --timeout=240 --benchmark-only --benchmark-json "${BASELINE_JSON}" --ignore test_llm.py .
	- name: Run PR benchmarks
	run: \|
	set -euxo pipefail
	source ./py310/bin/activate
	export PYTHON_INCLUDE_DIR=/usr/include/python3.10
	export TORCHRL_BENCHMARK_DEVICE="${{ matrix.device }}"
	if [ "${TORCHRL_BENCHMARK_DEVICE}" = "CPU" ]; then
	export CUDA_VISIBLE_DEVICES=
	fi
	git checkout --detach "${PR_HEAD_SHA}"
	rm -rf build
	python3.10 -m pip install -e . --no-build-isolation --no-deps

	if [ "${{ matrix.device }}" = "GPU" ]; then
	# test import and fail early if the GPU runner did not expose CUDA
	nvcc --version
	python -c "import torch; assert torch.cuda.device_count()"
	python -c "import torchrl._torchrl as ext; assert hasattr(ext, 'CudaSumSegmentTreeFp32')"
	fi

	REPO_ROOT="$(pwd)"
	cd "${REPO_ROOT}/benchmarks"
	export TORCHDYNAMO_INLINE_INBUILT_NN_MODULES=1
	export COMPOSITE_LP_AGGREGATE=0
	export TD_GET_DEFAULTS_TO_NONE=1
	export PYTHONPATH="${BENCHMARK_SITE_DIR}${PYTHONPATH:+:${PYTHONPATH}}"
	python -m pytest -vvv --rank 0 --timeout=240 --benchmark-only --benchmark-json "${CONTENDER_JSON}" --ignore test_llm.py .
	- name: Upload PR benchmark results
	if: ${{ always() }}
	run: \|
	set -euxo pipefail
	mkdir -p benchmark-results
	cp "${BASELINE_JSON}" benchmark-results/baseline.json
	cp "${CONTENDER_JSON}" benchmark-results/contender.json
	cat > benchmark-results/metadata.json <<EOF
	{
	"device": "${{ matrix.device }}",
	"pr_number": ${{ github.event.pull_request.number }},
	"base_sha": "${PR_BASE_SHA}",
	"head_sha": "${PR_HEAD_SHA}",
	"run_id": "${{ github.run_id }}"
	}
	EOF
	- name: Upload PR benchmark artifact
	if: ${{ always() }}
	uses: actions/upload-artifact@v4
	with:
	name: ${{ matrix.device }}-benchmark-pr-results
	path: benchmark-results

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[Feature] Support unpadded whole-trajectory collector batches #11370

Workflow file

[Feature] Support unpadded whole-trajectory collector batches #11370

Uh oh!

Workflow file for this run