adenzler-nvidia
diff --git a/‎.github/workflows/aws_gpu_tests.yml‎
Lines changed: 8 additions & 2 deletions b/‎.github/workflows/aws_gpu_tests.yml‎
Lines changed: 8 additions & 2 deletions
diff --git a/‎.github/workflows/docs-dev.yml‎
Lines changed: 2 additions & 0 deletions b/‎.github/workflows/docs-dev.yml‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎.github/workflows/docs-release.yml‎
Lines changed: 2 additions & 0 deletions b/‎.github/workflows/docs-release.yml‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎.github/workflows/minimum_deps_tests.yml‎
Lines changed: 194 additions & 0 deletions b/‎.github/workflows/minimum_deps_tests.yml‎
Lines changed: 194 additions & 0 deletions
diff --git a/‎…/scheduled_nightly_mujoco_warp_tests.yml‎ ‎.github/workflows/mujoco_warp_tests.yml‎.github/workflows/scheduled_nightly_mujoco_warp_tests.yml renamed to .github/workflows/mujoco_warp_tests.yml
Lines changed: 46 additions & 14 deletions b/‎…/scheduled_nightly_mujoco_warp_tests.yml‎ ‎.github/workflows/mujoco_warp_tests.yml‎.github/workflows/scheduled_nightly_mujoco_warp_tests.yml renamed to .github/workflows/mujoco_warp_tests.yml
Lines changed: 46 additions & 14 deletions
diff --git a/‎.github/workflows/pr.yml‎
Lines changed: 2 additions & 0 deletions b/‎.github/workflows/pr.yml‎
Lines changed: 2 additions & 0 deletions
@@ -5,7 +5,7 @@ name: GPU Unit Tests on AWS EC2 (Reusable)
 #   - pr_target_aws_gpu_tests.yml (for pull requests)
 #   - merge_queue_aws_gpu.yml (for merge groups)
 #   - push_aws_gpu.yml (for pushes to main/release branches)
-#   - scheduled_weekly_gpu_tests.yml (for weekly multi-GPU tests)
+#   - scheduled_nightly.yml (for nightly multi-GPU tests)
 
 # Workflow configuration variables
 env:
@@ -42,6 +42,12 @@ on:
       CODECOV_TOKEN:
         required: true
   workflow_dispatch:
+    inputs:
+      instance-type:
+        description: 'EC2 instance type'
+        required: false
+        type: string
+        default: 'g7e.2xlarge'
 
 jobs:
   start-runner:
@@ -215,7 +221,7 @@ jobs:
     needs:
       - start-runner
       - gpu-unit-tests
-    if: always() && github.repository == 'newton-physics/newton'
+    if: always() && needs.start-runner.result != 'skipped' && github.repository == 'newton-physics/newton'
     steps:
       - name: Harden the runner (Audit all outbound calls)
         uses: step-security/harden-runner@fa2e9d605c4eeb9fcad4c99c224cee0c6c7f3594  # v2.16.0
 
@@ -42,6 +42,8 @@ jobs:
 
       - name: Build Sphinx documentation
         run: uv run --extra docs --extra sim sphinx-build -j auto -b html docs docs/_build/html
+        env:
+          NEWTON_REQUIRE_PANDOC: "1"
 
       - name: Deploy to gh-pages /latest/
         run: |
 
@@ -75,6 +75,8 @@ jobs:
       - name: Build Sphinx documentation
         if: steps.version.outputs.SHOULD_DEPLOY == 'true'
         run: uv run --extra docs --extra sim sphinx-build -j auto -b html docs docs/_build/html
+        env:
+          NEWTON_REQUIRE_PANDOC: "1"
 
       - name: Deploy to gh-pages
         if: steps.version.outputs.SHOULD_DEPLOY == 'true'
 
@@ -0,0 +1,194 @@
+name: Minimum Dependency Version Tests on AWS EC2 (Reusable)
+
+# Standalone workflow that tests Newton with the lowest compatible versions
+# of direct PyPI dependencies (as specified by version floors in pyproject.toml).
+# Dispatched by scheduled_nightly.yml via the workflow_dispatch API.
+
+# Workflow-level settings. The AWS_* values are read through the `env` context
+# by the start-runner and stop-runner jobs in this file.
+env:
+  AWS_REGION: us-east-2
+  AWS_INSTANCE_TYPE: g7e.2xlarge
+  AWS_VOLUME_SIZE: 92
+  AWS_VOLUME_TYPE: gp3
+  AWS_SECURITY_GROUP_IDS: sg-07807c44e7f2a368a
+  AWS_ROLE_ARN: arn:aws:iam::968945269301:role/newton-physics-newton-ec2-github-runner-role
+  # Passed to configure-aws-credentials as role-duration-seconds.
+  AWS_ROLE_DURATION: 3600
+  # Same directory the EC2 pre-runner-script links _work/.local/.cache into.
+  # NOTE(review): workflow-level env applies to every job, including the
+  # ubuntu-latest start/stop jobs - confirm that is intended.
+  HOME: /actions-runner
+
+on:
+  # Reusable entry point; callers must forward both secrets.
+  workflow_call:
+    secrets:
+      GH_PERSONAL_ACCESS_TOKEN:
+        required: true
+      CODECOV_TOKEN:
+        required: true
+  # Manual/API entry point; takes no inputs.
+  workflow_dispatch:
+
+jobs:
+  # Launches the EC2 instance that hosts the self-hosted runner and publishes
+  # its runner label and instance id as job outputs for the downstream jobs.
+  start-runner:
+    name: Start self-hosted EC2 runner
+    # Skipped unless the workflow runs in the newton-physics/newton repository.
+    if: github.repository == 'newton-physics/newton'
+    runs-on: ubuntu-latest
+    permissions:
+      # NOTE(review): presumably needed for OIDC-based role assumption in the
+      # "Configure AWS credentials" step - confirm.
+      id-token: write
+      contents: read
+    outputs:
+      label: ${{ steps.start-ec2-runner.outputs.label }}
+      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
+    steps:
+      - name: Harden the runner (Audit all outbound calls)
+        uses: step-security/harden-runner@fa2e9d605c4eeb9fcad4c99c224cee0c6c7f3594  # v2.16.0
+        with:
+          egress-policy: audit
+
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@8df5847569e6427dd6c4fb1cf565c83acfa8afa7  # v6.0.0
+        with:
+          aws-region: ${{ env.AWS_REGION }}
+          role-to-assume: ${{ env.AWS_ROLE_ARN }}
+          role-duration-seconds: ${{ env.AWS_ROLE_DURATION }}
+
+      # Selects the most recently created available AMI whose name matches the
+      # filter, fails the job when none is found, and exports the id as
+      # LATEST_AMI_ID (via GITHUB_ENV) for the next step.
+      - name: Get the latest AWS Deep Learning Base GPU AMI
+        run: |
+          echo "Finding the latest AWS Deep Learning Base GPU AMI..."
+          LATEST_AMI_ID=$(aws ec2 describe-images --region ${{ env.AWS_REGION }} \
+            --owners amazon \
+            --filters 'Name=name,Values=Deep Learning Base AMI with Single CUDA (Ubuntu 22.04) ????????' 'Name=state,Values=available' \
+            --query 'reverse(sort_by(Images, &CreationDate))[:1].ImageId' \
+            --output text)
+          if [[ -z "$LATEST_AMI_ID" ]]; then
+            echo "❌ No AMI ID found. Exiting."
+            exit 1
+          fi
+          echo "Latest AMI ID found: $LATEST_AMI_ID"
+          echo "LATEST_AMI_ID=$LATEST_AMI_ID" >> "$GITHUB_ENV"
+
+      - name: Start EC2 runner
+        id: start-ec2-runner
+        uses: machulav/ec2-github-runner@a00f575a87f3a96ec6de9413d16eeb828a3cc0a8  # v2.5.2
+        with:
+          mode: start
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          ec2-instance-type: ${{ env.AWS_INSTANCE_TYPE }}
+          ec2-volume-size: ${{ env.AWS_VOLUME_SIZE }}
+          ec2-volume-type: ${{ env.AWS_VOLUME_TYPE }}
+          # Two candidate placements sharing the same AMI and security group.
+          # NOTE(review): presumably tried in order as fallbacks - confirm
+          # against the ec2-github-runner documentation.
+          availability-zones-config: >
+            [
+              {"imageId": "${{ env.LATEST_AMI_ID }}", "subnetId": "subnet-051b9d2e71acf8047", "securityGroupId": "${{ env.AWS_SECURITY_GROUP_IDS }}"},
+              {"imageId": "${{ env.LATEST_AMI_ID }}", "subnetId": "subnet-0c98bd06abe8ee5eb", "securityGroupId": "${{ env.AWS_SECURITY_GROUP_IDS }}"}
+            ]
+          # When /opt/dlami/nvme exists on the instance, create the runner's
+          # _work, .local and .cache directories there and symlink them into
+          # /actions-runner.
+          pre-runner-script: |
+            if [ -d /opt/dlami/nvme ]; then
+              mkdir -p /opt/dlami/nvme/actions-runner/_work
+              mkdir -p /opt/dlami/nvme/actions-runner/.local
+              mkdir -p /opt/dlami/nvme/actions-runner/.cache
+              ln -s /opt/dlami/nvme/actions-runner/_work /actions-runner/_work
+              ln -s /opt/dlami/nvme/actions-runner/.local /actions-runner/.local
+              ln -s /opt/dlami/nvme/actions-runner/.cache /actions-runner/.cache
+            fi
+          aws-resource-tags: >
+            [
+              {"Key": "Name", "Value": "ec2-github-runner"},
+              {"Key": "created-by", "Value": "github-actions-newton-role"},
+              {"Key": "GitHub-Repository", "Value": "${{ github.repository }}"}
+            ]
+
+  # Runs the Newton test suite on the EC2 runner started by start-runner, with
+  # direct dependencies resolved to their lowest allowed versions.
+  minimum-deps-tests:
+    name: Run Tests with Minimum Dependency Versions
+    needs: start-runner
+    # Run only when the runner actually started and the run was not cancelled.
+    if: ${{ !cancelled() && needs.start-runner.result == 'success' }}
+    runs-on: ${{ needs.start-runner.outputs.label }}
+    timeout-minutes: 60
+    permissions:
+      contents: read
+    env:
+      # Dump Python tracebacks on fatal signals (SIGSEGV, SIGABRT, etc.).
+      PYTHONFAULTHANDLER: "1"
+      # Keep every uv command in this job on the same resolution mode as the
+      # `uv lock` step. Without it, `uv tree` and `uv run` resolve with the
+      # default (highest) mode, treat the lowest-direct lockfile as stale, and
+      # re-lock to the newest versions, so the minimum versions are never tested.
+      UV_RESOLUTION: lowest-direct
+    steps:
+      - name: Harden the runner (Audit all outbound calls)
+        uses: step-security/harden-runner@fa2e9d605c4eeb9fcad4c99c224cee0c6c7f3594  # v2.16.0
+        with:
+          egress-policy: audit
+
+      - name: Checkout repository
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8  # v6.0.1
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78  # v7.6.0
+        with:
+          version: "0.11.0"
+
+      - name: Set up Python
+        run: uv python install
+
+      # Re-lock with the lowest versions of direct dependencies and print the
+      # top level of the resolved tree for the job log.
+      - name: Resolve minimum dependency versions
+        run: |
+          uv lock --resolution lowest-direct
+          echo "Resolved dependency versions:"
+          uv tree --depth 1
+
+      # Relies on UV_RESOLUTION (job env) so the lockfile written above is reused.
+      - name: Run Tests
+        run: uv run --extra dev -m newton.tests --junit-report-xml rspec.xml
+
+      # Reporting steps use !cancelled() so they still run after test failures.
+      - name: Test Summary
+        if: ${{ !cancelled() }}
+        uses: test-summary/action@31493c76ec9e7aa675f1585d3ed6f1da69269a86  # v2.4
+        with:
+          paths: "rspec.xml"
+          show: "fail"
+
+      # Best-effort upload: continue-on-error keeps a Codecov failure from
+      # failing the job.
+      - name: Upload test results to Codecov
+        if: ${{ !cancelled() }}
+        continue-on-error: true
+        uses: codecov/codecov-action@1af58845a975a7985b0beb0cbe6fbbb71a41dbad  # v5.5.3
+        with:
+          disable_search: true
+          files: ./rspec.xml
+          flags: minimum-deps-nightly
+          report_type: test_results
+          token: ${{ secrets.CODECOV_TOKEN }}
+
+      # Only on failure: emit error annotations and a step-summary table that
+      # explain which re-run option to use.
+      - name: Re-run instructions
+        if: failure()
+        run: |
+          echo "::error::DO NOT use 'Re-run failed jobs' - the EC2 runner no longer exists and your job will be queued forever."
+          echo "::error::USE 'Re-run all jobs' instead to start a fresh EC2 runner."
+          cat >> "$GITHUB_STEP_SUMMARY" << 'EOF'
+          ## ⚠️ How to Re-run This Workflow
+
+          This workflow uses **ephemeral EC2 runners** that are terminated after each run.
+
+          | | Option | Result |
+          |---|--------|--------|
+          | ❌ | **Re-run failed jobs** | Runner no longer exists → job queued forever |
+          | ✅ | **Re-run all jobs** | Starts new EC2 runner → tests re-run |
+          EOF
+
+  # Stops the EC2 runner identified by the label and instance id that
+  # start-runner exported.
+  stop-runner:
+    name: Stop self-hosted EC2 runner
+    runs-on: ubuntu-latest
+    permissions:
+      id-token: write
+      contents: read
+    needs:
+      - start-runner
+      - minimum-deps-tests
+    # always() lets cleanup run even when the test job failed or was cancelled;
+    # it is skipped only when start-runner was skipped or the workflow runs
+    # outside newton-physics/newton.
+    if: always() && needs.start-runner.result != 'skipped' && github.repository == 'newton-physics/newton'
+    steps:
+      - name: Harden the runner (Audit all outbound calls)
+        uses: step-security/harden-runner@fa2e9d605c4eeb9fcad4c99c224cee0c6c7f3594  # v2.16.0
+        with:
+          egress-policy: audit
+
+      # Credentials are obtained again here rather than reused from start-runner.
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@8df5847569e6427dd6c4fb1cf565c83acfa8afa7  # v6.0.0
+        with:
+          aws-region: ${{ env.AWS_REGION }}
+          role-to-assume: ${{ env.AWS_ROLE_ARN }}
+          role-duration-seconds: ${{ env.AWS_ROLE_DURATION }}
+
+      - name: Stop EC2 runner
+        uses: machulav/ec2-github-runner@a00f575a87f3a96ec6de9413d16eeb828a3cc0a8  # v2.5.2
+        with:
+          mode: stop
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          label: ${{ needs.start-runner.outputs.label }}
+          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
@@ -1,25 +1,26 @@
-name: Newton + MuJoCo Warp Nightly Builds
+name: MuJoCo Warp Tests on AWS EC2 (Reusable)
 
-# This workflow runs daily to test Newton with the latest mujoco-warp from source.
-# It installs mujoco-warp directly from the google-deepmind/mujoco_warp repository,
-# which pulls in whatever mujoco version it depends on.
-# Scheduled workflows automatically run on the default branch (main).
+# Standalone workflow that tests Newton with the latest mujoco-warp from source.
+# Not currently dispatched by scheduled_nightly.yml; kept available for manual dispatch/reuse.
 
-# Workflow configuration variables
 env:
   AWS_REGION: us-east-2
   AWS_INSTANCE_TYPE: g7e.2xlarge
   AWS_VOLUME_SIZE: 92
   AWS_VOLUME_TYPE: gp3
   AWS_SECURITY_GROUP_IDS: sg-07807c44e7f2a368a
   AWS_ROLE_ARN: arn:aws:iam::968945269301:role/newton-physics-newton-ec2-github-runner-role
-  AWS_ROLE_DURATION: 7200
-  PYTHONFAULTHANDLER: "1" # Dump tracebacks on fatal signals (SIGSEGV, SIGABRT, etc.)
+  AWS_ROLE_DURATION: 3600
+  HOME: /actions-runner
 
 on:
-  schedule:
-    - cron: '0 12 * * *'  # Daily at 12 PM UTC (4 AM PST)
-  workflow_dispatch:  # Allow manual triggers
+  workflow_call:
+    secrets:
+      GH_PERSONAL_ACCESS_TOKEN:
+        required: true
+      CODECOV_TOKEN:
+        required: true
+  workflow_dispatch:
 
 jobs:
   start-runner:
@@ -44,6 +45,7 @@ jobs:
           aws-region: ${{ env.AWS_REGION }}
           role-to-assume: ${{ env.AWS_ROLE_ARN }}
           role-duration-seconds: ${{ env.AWS_ROLE_DURATION }}
+
       - name: Get the latest AWS Deep Learning Base GPU AMI
         run: |
           echo "Finding the latest AWS Deep Learning Base GPU AMI..."
@@ -58,6 +60,7 @@ jobs:
           fi
           echo "Latest AMI ID found: $LATEST_AMI_ID"
           echo "LATEST_AMI_ID=$LATEST_AMI_ID" >> "$GITHUB_ENV"
+
       - name: Start EC2 runner
         id: start-ec2-runner
         uses: machulav/ec2-github-runner@a00f575a87f3a96ec6de9413d16eeb828a3cc0a8  # v2.5.2
@@ -88,14 +91,16 @@ jobs:
               {"Key": "GitHub-Repository", "Value": "${{ github.repository }}"}
             ]
 
-  nightly-mujoco-warp-tests:
+  mujoco-warp-tests:
     name: Run Tests with MuJoCo Warp from Source
     needs: start-runner
+    if: ${{ !cancelled() && needs.start-runner.result == 'success' }}
     runs-on: ${{ needs.start-runner.outputs.label }}
+    timeout-minutes: 60
     permissions:
       contents: read
     env:
-      HOME: /actions-runner
+      PYTHONFAULTHANDLER: "1"
     steps:
       - name: Harden the runner (Audit all outbound calls)
         uses: step-security/harden-runner@fa2e9d605c4eeb9fcad4c99c224cee0c6c7f3594  # v2.16.0
@@ -138,6 +143,33 @@ jobs:
           paths: "rspec.xml"
           show: "fail"
 
+      - name: Upload test results to Codecov
+        if: ${{ !cancelled() }}
+        continue-on-error: true
+        uses: codecov/codecov-action@1af58845a975a7985b0beb0cbe6fbbb71a41dbad  # v5.5.3
+        with:
+          disable_search: true
+          files: ./rspec.xml
+          flags: mujoco-warp-nightly
+          report_type: test_results
+          token: ${{ secrets.CODECOV_TOKEN }}
+
+      - name: Re-run instructions
+        if: failure()
+        run: |
+          echo "::error::DO NOT use 'Re-run failed jobs' - the EC2 runner no longer exists and your job will be queued forever."
+          echo "::error::USE 'Re-run all jobs' instead to start a fresh EC2 runner."
+          cat >> "$GITHUB_STEP_SUMMARY" << 'EOF'
+          ## ⚠️ How to Re-run This Workflow
+
+          This workflow uses **ephemeral EC2 runners** that are terminated after each run.
+
+          | | Option | Result |
+          |---|--------|--------|
+          | ❌ | **Re-run failed jobs** | Runner no longer exists → job queued forever |
+          | ✅ | **Re-run all jobs** | Starts new EC2 runner → tests re-run |
+          EOF
+
   stop-runner:
     name: Stop self-hosted EC2 runner
     runs-on: ubuntu-latest
@@ -146,7 +178,7 @@ jobs:
       contents: read
     needs:
       - start-runner
-      - nightly-mujoco-warp-tests
+      - mujoco-warp-tests
     if: always() && needs.start-runner.result != 'skipped' && github.repository == 'newton-physics/newton'
     steps:
       - name: Harden the runner (Audit all outbound calls)
 
@@ -61,6 +61,8 @@ jobs:
         uses: pandoc/actions/setup@86321b6dd4675f5014c611e05088e10d4939e09e  # v1.1.1
       - name: Build Sphinx documentation
         run: uv run --extra docs --extra sim sphinx-build -j auto -W -b html docs docs/_build/html
+        env:
+          NEWTON_REQUIRE_PANDOC: "1"
       - name: Verify API docs are up-to-date
         run: |
           git diff --exit-code docs/api/ || {