Enable sdpa backends for server export in export.py #2852

Workflow file for this run

.github/workflows/more-tests.yml at a50d9ab

	# Display name of this workflow in the GitHub Actions UI.
	name: Run parallel prefill

	# Triggers: every pull request, pushes to the main branch, and manual
	# runs started through workflow_dispatch.
	on:
	  pull_request:
	  push:
	    branches: [main]
	  workflow_dispatch:

	jobs:
	  # Generation smoke test on the stories15M checkpoint.
	  # Installs the requirements, downloads the checkpoint and tokenizer, then
	  # runs `torchchat.py generate` for every dtype x prefill/compile mode x
	  # sampling setting listed in the script below.
	  # NOTE(review): the job is named test-cuda and requests a GPU runner, but
	  # every generate call below passes --device cpu -- confirm this is intended.
	  test-cuda:
	    permissions:
	      id-token: write
	      contents: read
	    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
	    with:
	      runner: linux.g5.4xlarge.nvidia.gpu
	      gpu-arch-type: cuda
	      gpu-arch-version: "12.4"
	      timeout: 60
	      script: |
	        # Trace commands; abort on errors, unset variables and failed pipeline stages.
	        set -xeuo pipefail
	        echo "::group::Print machine info"
	        uname -a
	        echo "::endgroup::"

	        echo "::group::Install requirements"
	        ./install/install_requirements.sh cuda
	        pip3 list
	        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
	        echo "::endgroup::"

	        echo "::group::Download checkpoints"
	        mkdir -p checkpoints/stories15M
	        pushd checkpoints/stories15M
	        wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
	        wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
	        popd
	        echo "::endgroup::"

	        echo "::group::Run inference"
	        export MODEL_DIR=checkpoints/stories15M/
	        # Strip MODEL_DIR's trailing slash so the joined path has no "//".
	        export MODEL_PATH="${MODEL_DIR%/}/stories15M.pt"
	        export MODEL_NAME=stories15M

	        # Sampling settings exercised for every prefill/compile mode.
	        SAMPLING_ARGS=(
	          "--temperature 0"
	          "--temperature 0.9"
	          "--temperature 1.0"
	          "--top-k 100"
	          "--top-k 200"
	          "--top-k 500"
	        )

	        # Prefill/compile modes, in the order they are run:
	        #   1. no extra flags
	        #   2. compile and compile-prefill
	        #   3. sequential prefill
	        #   4. sequential prefill and compile
	        MODE_ARGS=(
	          ""
	          "--compile --compile-prefill"
	          "--sequential-prefill"
	          "--sequential-prefill --compile"
	        )

	        for DTYPE in bfloat16 float16 float32; do
	          for MODE in "${MODE_ARGS[@]}"; do
	            for SAMPLING in "${SAMPLING_ARGS[@]}"; do
	              # SAMPLING and MODE are left unquoted on purpose: each holds
	              # several flags that must be word-split (and MODE may be empty).
	              # shellcheck disable=SC2086
	              python torchchat.py generate --checkpoint-path "${MODEL_PATH}" --device cpu --dtype "${DTYPE}" ${SAMPLING} ${MODE}
	            done
	          done
	        done

	        echo "tests complete"
	        echo "******************************************"
	        echo "::endgroup::"


	# Job test-sdpa-backends-export (an entry of the jobs: map).
	  # Builds the native AOTI runner, then for every device x dtype x SDPA
	  # backend exports the stories15M checkpoint twice: once as a DSO that is run
	  # through `torchchat.py generate`, and once as an AOTI package that is run
	  # through ./cmake-out/aoti_run.
	  test-sdpa-backends-export:
	    permissions:
	      id-token: write
	      contents: read
	    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
	    with:
	      runner: linux.g5.4xlarge.nvidia.gpu
	      gpu-arch-type: cuda
	      gpu-arch-version: "12.4"
	      timeout: 60
	      script: |
	        # Trace commands; abort on errors, unset variables and failed pipeline stages.
	        set -xeuo pipefail
	        echo "::group::Print machine info"
	        uname -a
	        echo "::endgroup::"

	        echo "::group::Install requirements"
	        ./install/install_requirements.sh cuda
	        pip3 list
	        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
	        echo "::endgroup::"

	        echo "::group::Download checkpoints"
	        mkdir -p checkpoints/stories15M
	        pushd checkpoints/stories15M
	        wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
	        wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
	        popd
	        echo "::endgroup::"

	        echo "::group::Run inference"
	        export MODEL_DIR=checkpoints/stories15M/
	        # Strip MODEL_DIR's trailing slash so joined paths have no "//".
	        export MODEL_PATH="${MODEL_DIR%/}/stories15M.pt"
	        export MODEL_NAME=stories15M

	        # Build the native runner used below as ./cmake-out/aoti_run.
	        ./torchchat/utils/scripts/build_native.sh aoti

	        for DEVICE in cpu cuda; do
	          # depending on how the parameter passing works, may only be able to do bfloat16 for aoti_run, similar to runner-cuda-dtype.yml
	          # (although the runner environment should not have an opinion what we use in the artifact, and we might suitably abstract that)
	          for DTYPE in bfloat16 float16 float32; do
	            for SDPA in 'math' 'flash_attention' 'efficient_attention' 'cudnn_attention'; do
	              echo "***************************************************************"
	              echo "*** $DEVICE $DTYPE $SDPA"
	              ###################################################################
	              # Export DSO and run with Python
	              python torchchat.py export --output-dso dso.so --checkpoint-path "${MODEL_PATH}" --attention-backend "${SDPA}" --device "${DEVICE}" --dtype "${DTYPE}"
	              python torchchat.py generate --dso-path dso.so --checkpoint-path "${MODEL_PATH}" --attention-backend "${SDPA}" --device "${DEVICE}" --dtype "${DTYPE}" --temperature 0 --prompt "Once upon a time"
	              ###################################################################
	              # Export AOTI and run with aoti_run
	              python torchchat.py export --output-aoti /tmp/model.pt2 --checkpoint-path "${MODEL_PATH}" --attention-backend "${SDPA}" --device "${DEVICE}" --dtype "${DTYPE}"
	              ./cmake-out/aoti_run /tmp/model.pt2 -z "${MODEL_DIR%/}/tokenizer.model" -i "Once upon a time"
	              ###################################################################
	            done
	          done
	        done

	        echo "tests complete"
	        echo "******************************************"
	        echo "::endgroup::"

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Enable sdpa backends for server export in export.py #2852

Workflow file

Enable sdpa backends for server export in export.py #2852

Uh oh!

Jobs

Run details

Workflow file for this run