# Reproduce PlantCAD1 using Marin #2
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# CI workflow: when a repository member (or the owner) comments "@tpubot go" on
# a pull request, create a preemptible TPU VM, copy the pull request's code onto
# it, run the TPU / vLLM / evals / ray-data test suites inside the project Docker
# image, and always delete the TPU VM afterwards.
name: CI with GCP TPU

on:
  issue_comment:
    types: [created]

jobs:
  tpu-tests:
    # Run only for comments on pull requests (not plain issues) that contain the
    # trigger phrase and were written by an org member or the repository owner.
    if: |
      github.event.issue.pull_request &&
      contains(github.event.comment.body, '@tpubot go') &&
      (github.event.comment.author_association == 'MEMBER' ||
      github.actor == github.repository_owner)
    runs-on: ubuntu-latest
    env:
      TPU_ZONE: "us-east5-b"
      # One TPU VM per workflow run; the Cleanup step deletes it by this name.
      TPU_NAME: "ci-run-${{ github.run_id }}"
      DOCKER_IMAGE: "us-east5-docker.pkg.dev/hai-gcp-models/marin/marin_vllm"
      DOCKER_SHA: "6e804a10"
    steps:
      # For `issue_comment` events the default checkout ref is the repository's
      # default branch, so the pull request head must be requested explicitly;
      # otherwise the tests below would never exercise the PR's changes.
      # NOTE(review): this runs PR code with repository secrets available; the
      # job-level `if` (member/owner comment) is the only gate.
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: refs/pull/${{ github.event.issue.number }}/head

      # Authenticate first so that the gcloud installation set up in the next
      # step picks up the exported credentials.
      - name: Authenticate to Google Cloud
        uses: google-github-actions/auth@v2
        with:
          credentials_json: ${{ secrets.NEW_GCP_JSON }}

      - name: Set up Google Cloud SDK
        uses: google-github-actions/setup-gcloud@v2
        with:
          project_id: ${{ secrets.GCP_PROJECT_ID }}

      - name: Configure Google Cloud
        run: |
          gcloud config set project "${{ secrets.GCP_PROJECT_ID }}"

      - name: Create TPU VM
        id: create_tpu
        run: |
          # TPU_NAME and TPU_ZONE come from the job-level env block.
          echo "Creating TPU VM: ${TPU_NAME}"

          # Retry TPU creation up to 5 times with 30 second delays
          for i in {1..5}; do
            if gcloud compute tpus tpu-vm create "${TPU_NAME}" \
              --zone="${TPU_ZONE}" \
              --accelerator-type=v6e-8 \
              --version=v2-alpha-tpuv6e \
              --project="${{ secrets.GCP_PROJECT_ID }}" \
              --preemptible; then
              echo "Successfully created TPU on attempt $i"
              break
            fi
            if (( i == 5 )); then
              echo "Failed to create TPU after 5 attempts"
              exit 1
            fi
            echo "Attempt $i failed, waiting 30 seconds before retry..."
            sleep 30
          done

      - name: Copy current branch code to TPU VM
        run: |
          echo "Copying code from runner workspace to TPU VM at /tmp/repo"

          # Create a target directory on the TPU VM first.
          # Retry the SSH command up to 5 times with 30 second delays, since a
          # freshly created VM may not accept SSH connections immediately.
          for i in {1..5}; do
            if gcloud compute tpus tpu-vm ssh "${TPU_NAME}" \
              --zone="${TPU_ZONE}" \
              --project="${{ secrets.GCP_PROJECT_ID }}" \
              --command="mkdir -p /tmp/repo"; then
              echo "Successfully created directory on TPU on attempt $i"
              break
            fi
            if (( i == 5 )); then
              echo "Failed to create directory on TPU after 5 attempts"
              exit 1
            fi
            echo "Attempt $i failed, waiting 30 seconds before retry..."
            sleep 30
          done

          # Copy the checked-out code (entire workspace '.') to the target directory
          gcloud compute tpus tpu-vm scp . "${TPU_NAME}:/tmp/repo" \
            --recurse \
            --zone="${TPU_ZONE}" \
            --project="${{ secrets.GCP_PROJECT_ID }}"

      # Runs the test suites inside the project image on the TPU VM, with the
      # copied repository mounted at /opt/marin. The suites are chained with
      # `&&`, so the first failing suite stops the run and fails the step.
      #
      # NOTE(review): $BUCKET is not defined anywhere in this workflow. The
      # remote command is single-quoted on the runner, so it is expanded by the
      # shell on the TPU VM (inside the double-quoted `bash -c` string), and any
      # gcsfuse failure is deliberately ignored via `|| true`. Confirm whether
      # the mount is still needed and where BUCKET should come from.
      # NOTE(review): the secrets below are templated straight into the remote
      # command line; consider passing them through a file or stdin instead.
      - name: Configure docker and run tests
        run: |
          gcloud config set project "${{ secrets.GCP_PROJECT_ID }}"
          gcloud compute tpus tpu-vm ssh "${TPU_NAME}" --zone="${TPU_ZONE}" --project="${{ secrets.GCP_PROJECT_ID }}" \
            --command='sudo gcloud auth configure-docker us-east5-docker.pkg.dev --quiet \
            && sudo docker run --net host --shm-size=16G --privileged -v /tmp/repo:/opt/marin \
            ${{ env.DOCKER_IMAGE }}:${{ env.DOCKER_SHA }} \
            /bin/bash -c "gcsfuse --implicit-dirs --cache-dir /dev/shm --file-cache-max-size-mb 160000 --client-protocol grpc --only-dir gcsfuse_mount $BUCKET /opt/gcsfuse_mount || true \
            && export TPU_CI=true \
            && export START_RAY_TPU_CLUSTER=true \
            && export WANDB_API_KEY=${{ secrets.WANDB_API_KEY }} \
            && export WANDB_ENTITY=stanford-mercury \
            && export OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }} \
            && export HF_TOKEN=${{ secrets.HF_TOKEN }} \
            && uv sync \
            && echo TPU_TEST \
            && uv run pytest tests/tpu -s -v \
            && echo VLLM_TEST \
            && uv run pytest tests/vllm -s -v \
            && echo EVALS_TEST \
            && uv run pytest tests/evals -s -v \
            && echo RAY_DATA_TEST \
            && uv run pytest tests/ray-data -s -v"'

      # Always attempt to delete the TPU VM, even when earlier steps failed, so
      # that a failed run does not leave the VM behind.
      - name: Cleanup
        if: ${{ always() }}
        run: |
          # TPU_NAME and TPU_ZONE come from the job-level env block.
          echo gcloud compute tpus tpu-vm delete "${TPU_NAME}" --zone "${TPU_ZONE}" --quiet
          gcloud compute tpus tpu-vm delete "${TPU_NAME}" \
            --zone "${TPU_ZONE}" \
            --project "${{ secrets.GCP_PROJECT_ID }}" \
            --quiet