Skip to content

Reproduce PlantCAD1 using Marin #2

Reproduce PlantCAD1 using Marin

Reproduce PlantCAD1 using Marin #2

Workflow file for this run

# Runs the TPU-backed test suites on a temporary GCP TPU VM.
name: CI with GCP TPU

# Triggered by every newly created issue / pull-request comment; the job-level
# `if` decides whether a given comment actually starts a run.
on:
  issue_comment:
    types:
      - created
jobs:
  tpu-tests:
    # Gate: the comment must be on a pull request, contain "@tpubot go", and be
    # written by someone whose author association is MEMBER or by the
    # repository owner.
    if: >-
      github.event.issue.pull_request &&
      contains(github.event.comment.body, '@tpubot go') &&
      (github.event.comment.author_association == 'MEMBER' ||
      github.actor == github.repository_owner)
    runs-on: ubuntu-latest
    env:
      # Zone and per-run name of the TPU VM shared by the steps below.
      TPU_ZONE: 'us-east5-b'
      TPU_NAME: 'ci-run-${{ github.run_id }}'
      # Container image (repository and tag) in which the tests are executed.
      DOCKER_IMAGE: 'us-east5-docker.pkg.dev/hai-gcp-models/marin/marin_vllm'
      DOCKER_SHA: '6e804a10'
    steps:
# For `issue_comment` events the default checkout ref is the repository's
# default branch, not the pull request. Request the PR head explicitly so the
# code that is later copied to the TPU VM is the code under review.
# NOTE(review): this runs PR code with repository secrets available; the job's
# `if` restricts who can trigger it — confirm that gate is sufficient.
- name: Checkout code
  uses: actions/checkout@v4
  with:
    ref: refs/pull/${{ github.event.issue.number }}/head
# Makes the gcloud CLI available on the runner, configured for the CI project.
- name: Set up Google Cloud SDK
  uses: google-github-actions/setup-gcloud@v1
  with:
    project_id: '${{ secrets.GCP_PROJECT_ID }}'
# Authenticates subsequent gcloud calls using the JSON credentials stored in
# the NEW_GCP_JSON repository secret.
- name: Authenticate to Google Cloud
  uses: google-github-actions/auth@v2
  with:
    credentials_json: '${{ secrets.NEW_GCP_JSON }}'
# Sets the default gcloud project so later commands may omit --project.
- name: Configure Google Cloud
  run: gcloud config set project ${{ secrets.GCP_PROJECT_ID }}
# Provisions the preemptible v6e-8 TPU VM used by this run.
- name: Create TPU VM
  id: create_tpu
  run: |
    # TPU_NAME and TPU_ZONE are taken from the job-level env so that creation,
    # use and cleanup all refer to the same VM in the same zone.
    echo "Creating TPU VM: ${TPU_NAME}"
    # Retry TPU creation up to 5 times with 30 second delays
    for i in {1..5}; do
      if gcloud compute tpus tpu-vm create "${TPU_NAME}" \
        --zone="${TPU_ZONE}" \
        --accelerator-type=v6e-8 \
        --version=v2-alpha-tpuv6e \
        --project=${{ secrets.GCP_PROJECT_ID }} \
        --preemptible; then
        echo "Successfully created TPU on attempt $i"
        break
      else
        # Give up (and fail the step) once the last attempt has failed.
        if [ "$i" -eq 5 ]; then
          echo "Failed to create TPU after 5 attempts"
          exit 1
        fi
        echo "Attempt $i failed, waiting 30 seconds before retry..."
        sleep 30
      fi
    done
# Uploads the runner's checked-out workspace to /tmp/repo on the TPU VM.
- name: Copy current branch code to TPU VM
  run: |
    echo "Copying code from runner workspace to TPU VM at /tmp/repo"
    # The destination directory has to exist before scp runs. SSH may not be
    # reachable right away, so try up to 5 times, 30 seconds apart.
    attempt=1
    until gcloud compute tpus tpu-vm ssh ${{ env.TPU_NAME }} \
      --zone=${{ env.TPU_ZONE }} \
      --project=${{ secrets.GCP_PROJECT_ID }} \
      --command="mkdir -p /tmp/repo"; do
      if [ $attempt -eq 5 ]; then
        echo "Failed to create directory on TPU after 5 attempts"
        exit 1
      fi
      echo "Attempt $attempt failed, waiting 30 seconds before retry..."
      sleep 30
      attempt=$((attempt + 1))
    done
    echo "Successfully created directory on TPU on attempt $attempt"
    # Recursively copy the whole workspace ('.') into the prepared directory.
    gcloud compute tpus tpu-vm scp . ${{ env.TPU_NAME }}:/tmp/repo \
      --recurse \
      --zone=${{ env.TPU_ZONE }} \
      --project=${{ secrets.GCP_PROJECT_ID }}
# Runs the test suites inside the CI docker image on the TPU VM, with the
# uploaded repository mounted at /opt/marin.
- name: Configure docker and run tests
  run: |
    gcloud config set project ${{ secrets.GCP_PROJECT_ID }}
    # The whole --command string is single-quoted here, so it is sent verbatim
    # and interpreted by the shell on the TPU VM. The suites are chained with
    # &&, so the first failing suite stops the run and fails this step.
    # NOTE(review): $BUCKET is not defined anywhere in this workflow; it will
    # presumably expand to an empty string on the VM, and the `|| true` after
    # gcsfuse hides the resulting mount failure — confirm the intended bucket.
    gcloud compute tpus tpu-vm ssh ${{ env.TPU_NAME }} --zone=${{ env.TPU_ZONE }} --project=${{ secrets.GCP_PROJECT_ID }} \
      --command='sudo gcloud auth configure-docker us-east5-docker.pkg.dev --quiet \
      && sudo docker run --net host --shm-size=16G --privileged -v /tmp/repo:/opt/marin \
      ${{ env.DOCKER_IMAGE }}:${{ env.DOCKER_SHA }} \
      /bin/bash -c "gcsfuse --implicit-dirs --cache-dir /dev/shm --file-cache-max-size-mb 160000 --client-protocol grpc --only-dir gcsfuse_mount $BUCKET /opt/gcsfuse_mount || true \
      && export TPU_CI=true \
      && export START_RAY_TPU_CLUSTER=true \
      && export WANDB_API_KEY=${{ secrets.WANDB_API_KEY }} \
      && export WANDB_ENTITY=stanford-mercury \
      && export OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }} \
      && export HF_TOKEN=${{ secrets.HF_TOKEN }} \
      && uv sync \
      && echo TPU_TEST \
      && uv run pytest tests/tpu -s -v \
      && echo VLLM_TEST \
      && uv run pytest tests/vllm -s -v \
      && echo EVALS_TEST \
      && uv run pytest tests/evals -s -v \
      && echo RAY_DATA_TEST \
      && uv run pytest tests/ray-data -s -v"'
# Deletes the per-run TPU VM. `always()` makes this step run even when an
# earlier step failed or the run was cancelled, so the VM is not left behind.
- name: Cleanup
  if: ${{ always() }}
  run: |
    # TPU_NAME and TPU_ZONE come from the job-level env. The project is passed
    # explicitly, as in the other TPU commands, so the delete does not depend
    # on the gcloud default project having been configured.
    echo gcloud compute tpus tpu-vm delete "${TPU_NAME}" --zone "${TPU_ZONE}" --quiet
    gcloud compute tpus tpu-vm delete "${TPU_NAME}" \
      --zone "${TPU_ZONE}" \
      --project=${{ secrets.GCP_PROJECT_ID }} \
      --quiet