diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py
index 066d4427b94..5eadd4c9974 100644
--- a/.buildkite/generate_pipeline.py
+++ b/.buildkite/generate_pipeline.py
@@ -146,6 +146,7 @@ def _parse_args(args: Optional[str] = None):
parser.add_argument('--grpc', action="store_true")
parser.add_argument('--env-file')
parser.add_argument('--plugin-yaml')
+ parser.add_argument('--submodule-base-branch')
parser.add_argument('--dependency', nargs='?', const='', default='all')
parsed_args, _ = parser.parse_known_args(args_list)
@@ -190,6 +191,11 @@ def _parse_args(args: Optional[str] = None):
extra_args.append('--grpc')
if parsed_args.env_file:
extra_args.append(f'--env-file {parsed_args.env_file}')
+ if parsed_args.plugin_yaml:
+ extra_args.append(f'--plugin-yaml {parsed_args.plugin_yaml}')
+ if parsed_args.submodule_base_branch:
+ extra_args.append(
+ f'--submodule-base-branch {parsed_args.submodule_base_branch}')
if parsed_args.dependency != 'all':
space = ' ' if parsed_args.dependency else ''
extra_args.append(f'--dependency{space}{parsed_args.dependency}')
@@ -198,8 +204,9 @@ def _parse_args(args: Optional[str] = None):
def _extract_marked_tests(
- file_path: str, args: str
-) -> Dict[str, Tuple[List[str], List[str], List[Optional[str]]]]:
+ file_path: str, args: str
+) -> Dict[str, Tuple[List[str], List[str], List[Optional[str]], List[str],
+ List[bool]]]:
"""Extract test functions and filter clouds using pytest.mark
from a Python test file.
@@ -212,6 +219,10 @@ def _extract_marked_tests(
and run for hours. This makes it hard to visualize the test results and
rerun failures. Additionally, the parallelism would be controlled by pytest
instead of the buildkite job queue.
+
+ Returns:
+ Dict mapping function_name to tuple of:
+ (clouds, queues, params, extra_args, no_auto_retry_flags)
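+
+        For example (illustrative values only), a test marked
+        no_auto_retry that runs on AWS and GCP could map to:
+        (['aws', 'gcp'], ['aws_queue', 'gcp_queue'], [None, None],
+        ['', ''], [True, True])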
"""
# Args are already in the format pytest expects (cloud names like --lambda)
cmd = f'pytest {file_path} --collect-only {args}'
@@ -259,6 +270,7 @@ def _extract_marked_tests(
run_on_cloud_kube_backend = ('resource_heavy' in marks and
'kubernetes' in default_clouds_to_run)
benchmark_test = 'benchmark' in marks
+ no_auto_retry = 'no_auto_retry' in marks
for mark in marks:
if mark not in PYTEST_TO_CLOUD_KEYWORD:
@@ -302,20 +314,19 @@ def _extract_marked_tests(
for cloud in final_clouds_to_include
], param_list, [
extra_args for _ in range(len(final_clouds_to_include))
- ])
+ ], [no_auto_retry for _ in range(len(final_clouds_to_include))])
return function_cloud_map
-def _generate_pipeline(test_file: str,
- args: str,
- auto_retry: bool = False) -> Dict[str, Any]:
+def _generate_pipeline(test_file: str, args: str) -> Dict[str, Any]:
"""Generate a Buildkite pipeline from test files."""
steps = []
generated_steps_set = set()
function_cloud_map = _extract_marked_tests(test_file, args)
for test_function, clouds_queues_param in function_cloud_map.items():
- for cloud, queue, param, extra_args in zip(*clouds_queues_param):
+ for cloud, queue, param, extra_args, no_auto_retry in zip(
+ *clouds_queues_param):
label = f'{test_function} on {cloud}'
command = f'pytest {test_file}::{test_function} --{cloud}'
if param:
@@ -328,6 +339,7 @@ def _generate_pipeline(test_file: str,
continue
if 'PYTHON_VERSION' in os.environ:
command = f'PYTHONPATH="$PWD:$PYTHONPATH" {command}'
+
step = {
'label': label,
'command': command,
@@ -338,7 +350,15 @@ def _generate_pipeline(test_file: str,
'queue': queue
}
}
- if auto_retry:
+ if no_auto_retry:
+ # Disable automatic retries but allow manual retries.
+ step['retry'] = {
+ 'automatic': False,
+ 'manual': {
+ 'allowed': True
+ }
+ }
+ else:
step['retry'] = {
# Automatically retry 2 times on any failure by default.
'automatic': True
@@ -391,7 +411,7 @@ def _convert_release(test_files: List[str], args: str, trigger_command: str):
output_file_pipelines = []
for test_file in test_files:
print(f'Converting {test_file} to {yaml_file_path}')
- pipeline = _generate_pipeline(test_file, args, auto_retry=True)
+ pipeline = _generate_pipeline(test_file, args)
output_file_pipelines.append(pipeline)
print(f'Converted {test_file} to {yaml_file_path}\n\n')
# Enable all clouds by default for release pipeline.
@@ -462,11 +482,10 @@ def _convert_quick_tests_core(test_files: List[str], args: str,
branch != 'master'):
continue
pipeline = _generate_pipeline(test_file,
- args + f' --base-branch {branch}',
- auto_retry=True)
+ args + f' --base-branch {branch}')
output_file_pipelines.append(pipeline)
else:
- pipeline = _generate_pipeline(test_file, args, auto_retry=True)
+ pipeline = _generate_pipeline(test_file, args)
output_file_pipelines.append(pipeline)
print(f'Converted {test_file} to {yaml_file_path}\n\n')
_dump_pipeline_to_file(yaml_file_path,
diff --git a/.buildkite/test_buildkite_pipeline_generation.py b/.buildkite/test_buildkite_pipeline_generation.py
index 7c68064d18a..b5ce058c1f9 100644
--- a/.buildkite/test_buildkite_pipeline_generation.py
+++ b/.buildkite/test_buildkite_pipeline_generation.py
@@ -128,6 +128,61 @@ def _extract_test_names_from_pipeline(pipeline_path):
return test_names
+def _extract_steps_from_pipeline(pipeline_path):
+ """Extract all steps from a pipeline YAML file."""
+ with open(pipeline_path, 'r') as f:
+ pipeline = yaml.safe_load(f)
+
+ all_steps = []
+ for group in pipeline['steps']:
+ if 'steps' in group:
+ all_steps.extend(group['steps'])
+ else:
+ all_steps.append(group)
+ return all_steps
+
+
+def test_no_auto_retry_marker():
+ """Test that no_auto_retry marker works correctly.
+
+ This test uses the actual test_kubernetes_container_status_unknown_status_refresh
+ test which has the marker applied.
+ """
+ # Generate pipeline for the specific test
+ env = dict(os.environ)
+ env['PYTHONPATH'] = f"{pathlib.Path.cwd()}/tests:{env.get('PYTHONPATH', '')}"
+
+ subprocess.run([
+ 'python', '.buildkite/generate_pipeline.py', '--args', '--kubernetes',
+ '--file_pattern', 'test_cluster_job'
+ ],
+ env=env,
+ check=True)
+
+ # Check the generated pipeline
+ pipeline_path = pathlib.Path('.buildkite/pipeline_smoke_tests_release.yaml')
+ steps = _extract_steps_from_pipeline(pipeline_path)
+
+ # Find steps for test_kubernetes_container_status_unknown_status_refresh
+ target_steps = [
+ s for s in steps
+ if 'test_kubernetes_container_status_unknown_status_refresh' in s.get(
+ 'label', '')
+ ]
+
+ # Should have exactly 1 step
+ assert len(target_steps) == 1, \
+ f"Expected 1 step, got {len(target_steps)}"
+
+ # Verify no_auto_retry is applied
+ step = target_steps[0]
+ retry = step.get('retry', {})
+ assert retry.get('automatic') is False, \
+ f"no_auto_retry step should have automatic=False: {retry}"
+ assert retry.get('manual', {}).get('allowed') is True, \
+ f"no_auto_retry step should allow manual retry: {retry}"
+
+
@pytest.mark.parametrize('args', [
'',
'--aws',
diff --git a/.cursor/worktrees.json b/.cursor/worktrees.json
new file mode 100644
index 00000000000..aaf47d23c99
--- /dev/null
+++ b/.cursor/worktrees.json
@@ -0,0 +1,8 @@
+{
+ "setup-worktree": [
+ "uv venv --seed --python 3.11",
+ "uv pip install -e \".[all]\" --prerelease=allow",
+ "uv pip install -r requirements-dev.txt",
+ "npm --prefix sky/dashboard install && npm --prefix sky/dashboard run build"
+ ]
+}
diff --git a/.github/workflows/compile-protos-check.yml b/.github/workflows/compile-protos-check.yml
index 5601f995d30..2b895d1d93f 100644
--- a/.github/workflows/compile-protos-check.yml
+++ b/.github/workflows/compile-protos-check.yml
@@ -18,7 +18,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
- python-version: ["3.8"]
+ python-version: ["3.9"]
steps:
- uses: actions/checkout@v3
- name: Install the latest version of uv
diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml
index 3c85ed81252..18b67937150 100644
--- a/.github/workflows/format.yml
+++ b/.github/workflows/format.yml
@@ -18,7 +18,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
- python-version: ["3.8"]
+ python-version: ["3.9"]
steps:
- uses: actions/checkout@v3
- name: Install the latest version of uv
diff --git a/.github/workflows/mypy.yml b/.github/workflows/mypy.yml
index 6df98401fcb..9e198c0890c 100644
--- a/.github/workflows/mypy.yml
+++ b/.github/workflows/mypy.yml
@@ -18,7 +18,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
- python-version: ["3.8"]
+ python-version: ["3.9"]
steps:
- uses: actions/checkout@v3
- name: Install the latest version of uv
diff --git a/.github/workflows/nightly-build.yml b/.github/workflows/nightly-build.yml
index 3008e16ceb4..c979cb371e0 100644
--- a/.github/workflows/nightly-build.yml
+++ b/.github/workflows/nightly-build.yml
@@ -230,6 +230,20 @@ jobs:
secrets:
BUILDKITE_TOKEN: ${{ secrets.BUILDKITE_TOKEN }}
+ smoke-tests-kubernetes-jobs-consolidation:
+ needs: [gate-tests, nightly-build-pypi]
+ if: ${{ needs.gate-tests.outputs.run_tests == 'true' }}
+ uses: ./.github/workflows/buildkite-trigger-wait.yml
+ with:
+ commit: ${{ github.sha }}
+ branch: ${{ github.ref_name }}
+ message: "nightly-build-pypi --kubernetes --jobs-consolidation --no-resource-heavy"
+ pipeline: "smoke-tests"
+ build_env_vars: '{"ARGS": "--kubernetes --jobs-consolidation --no-resource-heavy"}'
+ timeout_minutes: 60
+ secrets:
+ BUILDKITE_TOKEN: ${{ secrets.BUILDKITE_TOKEN }}
+
smoke-tests-shared-gke-api-server:
needs: [gate-tests, nightly-build-pypi]
if: ${{ needs.gate-tests.outputs.run_tests == 'true' }}
@@ -273,12 +287,10 @@ jobs:
# BUILDKITE_TOKEN: ${{ secrets.BUILDKITE_TOKEN }}
publish-and-validate-both:
- # needs: [gate-tests, nightly-build-pypi, smoke-tests-aws, smoke-tests-kubernetes-resource-heavy, smoke-tests-kubernetes-no-resource-heavy, smoke-tests-kubernetes-no-resource-heavy-limit-deps, smoke-tests-remote-server-kubernetes, smoke-tests-shared-gke-api-server, smoke-tests-lambda-job-queue, smoke-tests-runpod-minimal, backward-compat-test-nightly, backward-compat-test-stable]
- needs: [gate-tests, nightly-build-pypi, smoke-tests-aws, smoke-tests-kubernetes-resource-heavy, smoke-tests-kubernetes-no-resource-heavy, smoke-tests-kubernetes-no-resource-heavy-limit-deps, smoke-tests-remote-server-kubernetes, smoke-tests-shared-gke-api-server, smoke-tests-lambda-job-queue, backward-compat-test-nightly, backward-compat-test-stable]
+ needs: [gate-tests, nightly-build-pypi, smoke-tests-aws, smoke-tests-kubernetes-resource-heavy, smoke-tests-kubernetes-no-resource-heavy, smoke-tests-kubernetes-no-resource-heavy-limit-deps, smoke-tests-remote-server-kubernetes, smoke-tests-kubernetes-jobs-consolidation, smoke-tests-shared-gke-api-server, smoke-tests-lambda-job-queue, backward-compat-test-nightly, backward-compat-test-stable]
# Allow publish/validate for manual dispatch or the original nightly cron; skip for the 5PM PT preflight
# Use always() so this job evaluates even if some test jobs were skipped when skip_buildkite is selected
- # if: ${{ always() && (github.event_name == 'workflow_dispatch' || (github.event_name == 'schedule' && github.event.schedule == '35 8 * * *')) && needs.nightly-build-pypi.result == 'success' && (needs.gate-tests.outputs.publish_without_tests == 'true' || (needs.gate-tests.outputs.run_tests == 'true' && needs.smoke-tests-aws.result == 'success' && needs.smoke-tests-kubernetes-resource-heavy.result == 'success' && needs.smoke-tests-kubernetes-no-resource-heavy.result == 'success' && needs.smoke-tests-kubernetes-no-resource-heavy-limit-deps.result == 'success' && needs.smoke-tests-remote-server-kubernetes.result == 'success' && needs.smoke-tests-shared-gke-api-server.result == 'success' && needs.smoke-tests-lambda-job-queue.result == 'success' && needs.smoke-tests-runpod-minimal.result == 'success' && needs.backward-compat-test-nightly.result == 'success' && needs.backward-compat-test-stable.result == 'success')) }}
- if: ${{ always() && (github.event_name == 'workflow_dispatch' || (github.event_name == 'schedule' && github.event.schedule == '35 8 * * *')) && needs.nightly-build-pypi.result == 'success' && (needs.gate-tests.outputs.publish_without_tests == 'true' || (needs.gate-tests.outputs.run_tests == 'true' && needs.smoke-tests-aws.result == 'success' && needs.smoke-tests-kubernetes-resource-heavy.result == 'success' && needs.smoke-tests-kubernetes-no-resource-heavy.result == 'success' && needs.smoke-tests-kubernetes-no-resource-heavy-limit-deps.result == 'success' && needs.smoke-tests-remote-server-kubernetes.result == 'success' && needs.smoke-tests-shared-gke-api-server.result == 'success' && needs.smoke-tests-lambda-job-queue.result == 'success' && needs.backward-compat-test-nightly.result == 'success' && needs.backward-compat-test-stable.result == 'success')) }}
+ if: ${{ always() && (github.event_name == 'workflow_dispatch' || (github.event_name == 'schedule' && github.event.schedule == '35 8 * * *')) && needs.nightly-build-pypi.result == 'success' && (needs.gate-tests.outputs.publish_without_tests == 'true' || (needs.gate-tests.outputs.run_tests == 'true' && needs.smoke-tests-aws.result == 'success' && needs.smoke-tests-kubernetes-resource-heavy.result == 'success' && needs.smoke-tests-kubernetes-no-resource-heavy.result == 'success' && needs.smoke-tests-kubernetes-no-resource-heavy-limit-deps.result == 'success' && needs.smoke-tests-remote-server-kubernetes.result == 'success' && needs.smoke-tests-kubernetes-jobs-consolidation.result == 'success' && needs.smoke-tests-shared-gke-api-server.result == 'success' && needs.smoke-tests-lambda-job-queue.result == 'success' && needs.backward-compat-test-nightly.result == 'success' && needs.backward-compat-test-stable.result == 'success')) }}
uses: ./.github/workflows/publish-and-validate-both.yml
with:
package_name: skypilot-nightly
@@ -297,8 +309,7 @@ jobs:
summary:
runs-on: ubuntu-latest
- needs: [check-date, nightly-build-pypi, smoke-tests-aws, smoke-tests-kubernetes-resource-heavy, smoke-tests-kubernetes-no-resource-heavy, smoke-tests-kubernetes-no-resource-heavy-limit-deps, smoke-tests-remote-server-kubernetes, smoke-tests-shared-gke-api-server, smoke-tests-lambda-job-queue, backward-compat-test-nightly, backward-compat-test-stable]
- # needs: [check-date, nightly-build-pypi, smoke-tests-aws, smoke-tests-kubernetes-resource-heavy, smoke-tests-kubernetes-no-resource-heavy, smoke-tests-kubernetes-no-resource-heavy-limit-deps, smoke-tests-remote-server-kubernetes, smoke-tests-shared-gke-api-server, smoke-tests-lambda-job-queue, smoke-tests-runpod-minimal, backward-compat-test-nightly, backward-compat-test-stable]
+ needs: [check-date, nightly-build-pypi, smoke-tests-aws, smoke-tests-kubernetes-resource-heavy, smoke-tests-kubernetes-no-resource-heavy, smoke-tests-kubernetes-no-resource-heavy-limit-deps, smoke-tests-remote-server-kubernetes, smoke-tests-kubernetes-jobs-consolidation, smoke-tests-shared-gke-api-server, smoke-tests-lambda-job-queue, backward-compat-test-nightly, backward-compat-test-stable]
if: always()
steps:
- name: Summary
@@ -333,6 +344,11 @@ jobs:
- [Smoke Tests Remote Server Kubernetes](https://buildkite.com/skypilot-1/smoke-tests/builds/${{ needs.smoke-tests-remote-server-kubernetes.outputs.build_number }}) - $([ "${{ needs.smoke-tests-remote-server-kubernetes.outputs.build_status }}" == "success" ] && echo "✅ Success" || echo "❌ Failed")
EOF
fi
+ if [ "${{ needs.smoke-tests-kubernetes-jobs-consolidation.result }}" != "skipped" ] && [ -n "${{ needs.smoke-tests-kubernetes-jobs-consolidation.outputs.build_number }}" ]; then
+            cat <<EOF >> "$GITHUB_STEP_SUMMARY"
+ - [Smoke Tests Kubernetes (Jobs Consolidation)](https://buildkite.com/skypilot-1/smoke-tests/builds/${{ needs.smoke-tests-kubernetes-jobs-consolidation.outputs.build_number }}) - $([ "${{ needs.smoke-tests-kubernetes-jobs-consolidation.outputs.build_status }}" == "success" ] && echo "✅ Success" || echo "❌ Failed")
+ EOF
+ fi
if [ "${{ needs.smoke-tests-shared-gke-api-server.result }}" != "skipped" ] && [ -n "${{ needs.smoke-tests-shared-gke-api-server.outputs.build_number }}" ]; then
cat <<EOF >> "$GITHUB_STEP_SUMMARY"
- [Smoke Tests Shared GKE API Server](https://buildkite.com/skypilot-1/nightly-build-shared-gke-api-server/builds/${{ needs.smoke-tests-shared-gke-api-server.outputs.build_number }}) - $([ "${{ needs.smoke-tests-shared-gke-api-server.outputs.build_status }}" == "success" ] && echo "✅ Success" || echo "❌ Failed")
@@ -361,8 +377,7 @@ jobs:
notify-slack-failure:
runs-on: ubuntu-latest
- needs: [check-date, nightly-build-pypi, smoke-tests-aws, smoke-tests-kubernetes-resource-heavy, smoke-tests-kubernetes-no-resource-heavy, smoke-tests-kubernetes-no-resource-heavy-limit-deps, smoke-tests-remote-server-kubernetes, smoke-tests-shared-gke-api-server, smoke-tests-lambda-job-queue, backward-compat-test-nightly, backward-compat-test-stable, publish-and-validate-both, trigger-docker-and-helm-release]
- # needs: [check-date, nightly-build-pypi, smoke-tests-aws, smoke-tests-kubernetes-resource-heavy, smoke-tests-kubernetes-no-resource-heavy, smoke-tests-kubernetes-no-resource-heavy-limit-deps, smoke-tests-remote-server-kubernetes, smoke-tests-shared-gke-api-server, smoke-tests-lambda-job-queue, smoke-tests-runpod-minimal, backward-compat-test-nightly, backward-compat-test-stable, publish-and-validate-both, trigger-docker-and-helm-release]
+ needs: [check-date, nightly-build-pypi, smoke-tests-aws, smoke-tests-kubernetes-resource-heavy, smoke-tests-kubernetes-no-resource-heavy, smoke-tests-kubernetes-no-resource-heavy-limit-deps, smoke-tests-remote-server-kubernetes, smoke-tests-kubernetes-jobs-consolidation, smoke-tests-shared-gke-api-server, smoke-tests-lambda-job-queue, backward-compat-test-nightly, backward-compat-test-stable, publish-and-validate-both, trigger-docker-and-helm-release]
# Only run this job if any of the previous jobs failed
if: failure()
steps:
@@ -374,8 +389,7 @@ jobs:
COMMIT_URL="${{ github.server_url }}/${{ github.repository }}/commit/${COMMIT_SHA}"
SHORT_SHA=$(echo "$COMMIT_SHA" | cut -c1-7)
BUILDKITE_MSG=""
- if [[ "${{ needs.smoke-tests-aws.result }}" == "failure" || "${{ needs.smoke-tests-kubernetes-resource-heavy.result }}" == "failure" || "${{ needs.smoke-tests-kubernetes-no-resource-heavy.result }}" == "failure" || "${{ needs.smoke-tests-kubernetes-no-resource-heavy-limit-deps.result }}" == "failure" || "${{ needs.smoke-tests-remote-server-kubernetes.result }}" == "failure" || "${{ needs.smoke-tests-shared-gke-api-server.result }}" == "failure" || "${{ needs.smoke-tests-lambda-job-queue.result }}" == "failure" || "${{ needs.backward-compat-test-nightly.result }}" == "failure" || "${{ needs.backward-compat-test-stable.result }}" == "failure" ]]; then
- # if [[ "${{ needs.smoke-tests-aws.result }}" == "failure" || "${{ needs.smoke-tests-kubernetes-resource-heavy.result }}" == "failure" || "${{ needs.smoke-tests-kubernetes-no-resource-heavy.result }}" == "failure" || "${{ needs.smoke-tests-kubernetes-no-resource-heavy-limit-deps.result }}" == "failure" || "${{ needs.smoke-tests-remote-server-kubernetes.result }}" == "failure" || "${{ needs.smoke-tests-shared-gke-api-server.result }}" == "failure" || "${{ needs.smoke-tests-lambda-job-queue.result }}" == "failure" || "${{ needs.smoke-tests-runpod-minimal.result }}" == "failure" || "${{ needs.backward-compat-test-nightly.result }}" == "failure" || "${{ needs.backward-compat-test-stable.result }}" == "failure" ]]; then
+ if [[ "${{ needs.smoke-tests-aws.result }}" == "failure" || "${{ needs.smoke-tests-kubernetes-resource-heavy.result }}" == "failure" || "${{ needs.smoke-tests-kubernetes-no-resource-heavy.result }}" == "failure" || "${{ needs.smoke-tests-kubernetes-no-resource-heavy-limit-deps.result }}" == "failure" || "${{ needs.smoke-tests-remote-server-kubernetes.result }}" == "failure" || "${{ needs.smoke-tests-kubernetes-jobs-consolidation.result }}" == "failure" || "${{ needs.smoke-tests-shared-gke-api-server.result }}" == "failure" || "${{ needs.smoke-tests-lambda-job-queue.result }}" == "failure" || "${{ needs.backward-compat-test-nightly.result }}" == "failure" || "${{ needs.backward-compat-test-stable.result }}" == "failure" ]]; then
if [[ "${{ needs.smoke-tests-aws.result }}" == "failure" ]]; then
BUILDKITE_MSG=""
fi
@@ -403,6 +417,12 @@ jobs:
fi
BUILDKITE_MSG="${BUILDKITE_MSG} "
fi
+ if [[ "${{ needs.smoke-tests-kubernetes-jobs-consolidation.result }}" == "failure" ]]; then
+ if [[ ! -z "$BUILDKITE_MSG" ]]; then
+ BUILDKITE_MSG="${BUILDKITE_MSG} and"
+ fi
+ BUILDKITE_MSG="${BUILDKITE_MSG} "
+ fi
if [[ "${{ needs.smoke-tests-shared-gke-api-server.result }}" == "failure" ]]; then
if [[ ! -z "$BUILDKITE_MSG" ]]; then
BUILDKITE_MSG="${BUILDKITE_MSG} and"
diff --git a/.github/workflows/publish-helm.yml b/.github/workflows/publish-helm.yml
index 005bbe51aa0..a6611187266 100644
--- a/.github/workflows/publish-helm.yml
+++ b/.github/workflows/publish-helm.yml
@@ -115,11 +115,6 @@ jobs:
line=$(grep -n "^-----*$" src/README.md | cut -d: -f1 | head -n 1)
tail -n +$line src/README.md >> src/charts/skypilot/README.md
- # Update the version in the external-metrics chart (prometheus server)
- # todo(rohan): update name the way we do for the main skypilot chart?
- sed -i "s/^version:.*$/version: ${semversion}/" src/charts/external-metrics/Chart.yaml
- sed -i "s/^appVersion:.*$/appVersion: ${version}/" src/charts/external-metrics/Chart.yaml
-
- name: Update docker image in charts
if: inputs.version != ''
run: |
diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml
index 7d2847ab501..43c85f77fc4 100644
--- a/.github/workflows/pylint.yml
+++ b/.github/workflows/pylint.yml
@@ -18,7 +18,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
- python-version: ["3.8"]
+ python-version: ["3.9"]
steps:
- uses: actions/checkout@v3
- name: Install the latest version of uv
diff --git a/.github/workflows/pytest-optimizer.yml b/.github/workflows/pytest-optimizer.yml
index 3f8667b67d7..578b0cb0da0 100644
--- a/.github/workflows/pytest-optimizer.yml
+++ b/.github/workflows/pytest-optimizer.yml
@@ -19,13 +19,16 @@ jobs:
python-version: ["3.9"]
test-path:
- "tests/test_optimizer_dryruns.py -k \"partial\""
- - "tests/test_optimizer_dryruns.py -k \"not partial\""
+ - "tests/test_optimizer_dryruns.py -k \"not partial and not accelerator_memory and not accelerator_manufacturer\""
+ - "tests/test_optimizer_dryruns.py -k \"accelerator_memory or accelerator_manufacturer\""
- tests/test_optimizer_random_dag.py
include:
- test-path: "tests/test_optimizer_dryruns.py -k \"partial\""
test-name: "Optimizer Dryruns Part 1"
- - test-path: "tests/test_optimizer_dryruns.py -k \"not partial\""
+ - test-path: "tests/test_optimizer_dryruns.py -k \"not partial and not accelerator_memory and not accelerator_manufacturer\""
test-name: "Optimizer Dryruns Part 2"
+ - test-path: "tests/test_optimizer_dryruns.py -k \"accelerator_memory or accelerator_manufacturer\""
+ test-name: "Optimizer Dryruns Part 3"
- test-path: tests/test_optimizer_random_dag.py
test-name: "Optimizer Random DAG Tests"
runs-on: ubuntu-latest
@@ -38,4 +41,4 @@ jobs:
with:
python-version: ${{ matrix.python-version }}
test-path: ${{ matrix.test-path }}
- test-name: ${{ matrix.test-name }}
\ No newline at end of file
+ test-name: ${{ matrix.test-name }}
diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
index 271e967729a..422db1a7f9c 100644
--- a/.github/workflows/pytest.yml
+++ b/.github/workflows/pytest.yml
@@ -21,7 +21,8 @@ jobs:
# Group them based on running time to save CI time and resources
- tests/unit_tests
- tests/test_cli.py
- - tests/test_jobs_and_serve.py tests/test_yaml_parser.py tests/test_global_user_state.py tests/test_config.py tests/test_jobs.py tests/test_list_accelerators.py tests/test_wheels.py tests/test_api.py tests/test_storage.py tests/test_api_compatibility.py
+ - tests/test_jobs.py tests/test_jobs_and_serve.py tests/test_list_accelerators.py tests/test_api.py
+ - tests/test_config.py tests/test_wheels.py tests/test_yaml_parser.py tests/test_global_user_state.py tests/test_storage.py tests/test_api_compatibility.py tests/test_infra_k8s_alias.py
- tests/test_no_parellel.py
- tests/test_ssh_proxy_lag.py
include:
@@ -31,8 +32,10 @@ jobs:
test-name: "Unit Tests"
- test-path: tests/test_cli.py
test-name: "CLI Tests"
- - test-path: tests/test_jobs_and_serve.py tests/test_yaml_parser.py tests/test_global_user_state.py tests/test_config.py tests/test_jobs.py tests/test_list_accelerators.py tests/test_wheels.py tests/test_api.py tests/test_storage.py tests/test_api_compatibility.py tests/test_infra_k8s_alias.py
- test-name: "Jobs, Serve, Wheels, API, Config, Optimizer & Storage Tests"
+ - test-path: tests/test_jobs.py tests/test_jobs_and_serve.py tests/test_list_accelerators.py tests/test_api.py
+ test-name: "Jobs & API Tests"
+ - test-path: tests/test_config.py tests/test_wheels.py tests/test_yaml_parser.py tests/test_global_user_state.py tests/test_storage.py tests/test_api_compatibility.py tests/test_infra_k8s_alias.py
+ test-name: "Config, Storage & Compatibility Tests"
- test-path: tests/test_no_parellel.py
test-name: "No Parallel Tests"
- test-path: tests/test_ssh_proxy_lag.py
diff --git a/.github/workflows/release-build.yml b/.github/workflows/release-build.yml
index 80819f5c53a..9342d563a34 100644
--- a/.github/workflows/release-build.yml
+++ b/.github/workflows/release-build.yml
@@ -190,7 +190,7 @@ jobs:
smoke-tests:
needs: release-build
if: |
- always() &&
+ always() &&
needs.release-build.result == 'success' &&
github.event.inputs.skip_smoke_tests != 'true'
uses: ./.github/workflows/buildkite-trigger-wait.yml
@@ -208,7 +208,7 @@ jobs:
quicktest-core:
needs: release-build
if: |
- always() &&
+ always() &&
needs.release-build.result == 'success' &&
github.event.inputs.skip_smoke_tests != 'true'
uses: ./.github/workflows/buildkite-trigger-wait.yml
@@ -227,7 +227,7 @@ jobs:
quicktest-core-previous-minor:
needs: release-build
if: |
- always() &&
+ always() &&
needs.release-build.result == 'success' &&
github.event.inputs.skip_smoke_tests != 'true'
uses: ./.github/workflows/buildkite-trigger-wait.yml
@@ -246,7 +246,7 @@ jobs:
smoke-tests-remote-server-kubernetes:
needs: release-build
if: |
- always() &&
+ always() &&
needs.release-build.result == 'success' &&
github.event.inputs.skip_smoke_tests != 'true'
uses: ./.github/workflows/buildkite-trigger-wait.yml
@@ -263,10 +263,29 @@ jobs:
secrets:
BUILDKITE_TOKEN: ${{ secrets.BUILDKITE_TOKEN }}
+ smoke-tests-kubernetes-jobs-consolidation:
+ needs: release-build
+ if: |
+ always() &&
+ needs.release-build.result == 'success' &&
+ github.event.inputs.skip_smoke_tests != 'true'
+ uses: ./.github/workflows/buildkite-trigger-wait.yml
+ with:
+ commit: ${{ needs.release-build.outputs.new_commit_sha }}
+ branch: ${{ needs.release-build.outputs.test_branch }}
+ message: "Release ${{ needs.release-build.outputs.release_version }} --kubernetes --jobs-consolidation --no-resource-heavy"
+ pipeline: "smoke-tests"
+ build_env_vars: '{"ARGS": "--kubernetes --jobs-consolidation --no-resource-heavy"}'
+ timeout_minutes: 60
+ wait: true
+ fail_on_buildkite_failure: true
+ secrets:
+ BUILDKITE_TOKEN: ${{ secrets.BUILDKITE_TOKEN }}
+
release-tests:
needs: release-build
if: |
- always() &&
+ always() &&
needs.release-build.result == 'success' &&
github.event.inputs.skip_smoke_tests != 'true'
uses: ./.github/workflows/buildkite-trigger-wait.yml
@@ -281,7 +300,7 @@ jobs:
BUILDKITE_TOKEN: ${{ secrets.BUILDKITE_TOKEN }}
create-pr:
- needs: [release-build, smoke-tests, quicktest-core, quicktest-core-previous-minor, smoke-tests-remote-server-kubernetes, release-tests]
+ needs: [release-build, smoke-tests, quicktest-core, quicktest-core-previous-minor, smoke-tests-remote-server-kubernetes, smoke-tests-kubernetes-jobs-consolidation, release-tests]
if: always() && needs.release-build.result == 'success'
runs-on: ubuntu-latest
steps:
@@ -314,21 +333,21 @@ jobs:
if [ "$SKIP_SMOKE_TESTS" == "true" ]; then
if [ "$IS_RC_PROMOTION" == "true" ]; then
PR_BODY="## Promote RC to Stable Release ${RELEASE_VERSION}
-
- **Source:** \`$SOURCE_BRANCH\` (RC version: $RC_VERSION)
+
+ **Source:** \`$SOURCE_BRANCH\` (RC version: $RC_VERSION)
**Target:** Stable release \`${RELEASE_VERSION}\`
-
+
⚠️ **Smoke tests were SKIPPED** - This release is being promoted from a tested RC.
-
+
### Pre-release Testing
This version was previously tested as release candidate \`$RC_VERSION\` and deemed stable by early adopters.
-
+
### Changes in this PR
- Updated \`sky/__init__.py\`: \`$RC_VERSION\` → \`${RELEASE_VERSION}\`
- Updated \`charts/skypilot/values.yaml\`: Docker image tag \`$RC_VERSION\` → \`${RELEASE_VERSION}\`"
else
PR_BODY="Release ${RELEASE_VERSION}
-
+
⚠️ **Smoke tests were SKIPPED** - Please ensure manual testing was performed."
fi
else
@@ -340,6 +359,7 @@ jobs:
- [Quicktest Core](https://buildkite.com/skypilot-1/quicktest-core/builds/${{ needs.quicktest-core.outputs.build_number }}) - $([ "${{ needs.quicktest-core.outputs.build_status }}" == "success" ] && echo "✅ Success" || echo "❌ Failed")
- [Quicktest Core (vs Previous Minor)](https://buildkite.com/skypilot-1/quicktest-core/builds/${{ needs.quicktest-core-previous-minor.outputs.build_number }}) - $([ "${{ needs.quicktest-core-previous-minor.outputs.build_status }}" == "success" ] && echo "✅ Success" || echo "❌ Failed")
- [Smoke Tests Remote Server Kubernetes](https://buildkite.com/skypilot-1/smoke-tests/builds/${{ needs.smoke-tests-remote-server-kubernetes.outputs.build_number }}) - $([ "${{ needs.smoke-tests-remote-server-kubernetes.outputs.build_status }}" == "success" ] && echo "✅ Success" || echo "❌ Failed")
+ - [Smoke Tests Kubernetes (Jobs Consolidation)](https://buildkite.com/skypilot-1/smoke-tests/builds/${{ needs.smoke-tests-kubernetes-jobs-consolidation.outputs.build_number }}) - $([ "${{ needs.smoke-tests-kubernetes-jobs-consolidation.outputs.build_status }}" == "success" ] && echo "✅ Success" || echo "❌ Failed")
- [Release Tests](https://buildkite.com/skypilot-1/release/builds/${{ needs.release-tests.outputs.build_number }}) - ⏳ (not waiting for completion)
*Release Tests may take up to 24 hours to complete and might fail due to resource constraints.*"
@@ -384,6 +404,7 @@ jobs:
- [Quicktest Core](https://buildkite.com/skypilot-1/quicktest-core/builds/${{ needs.quicktest-core.outputs.build_number }}) - $([ "${{ needs.quicktest-core.outputs.build_status }}" == "success" ] && echo "✅ Success" || echo "❌ Failed")
- [Quicktest Core (vs Previous Minor)](https://buildkite.com/skypilot-1/quicktest-core/builds/${{ needs.quicktest-core-previous-minor.outputs.build_number }}) - $([ "${{ needs.quicktest-core-previous-minor.outputs.build_status }}" == "success" ] && echo "✅ Success" || echo "❌ Failed")
- [Smoke Tests Remote Server Kubernetes](https://buildkite.com/skypilot-1/smoke-tests/builds/${{ needs.smoke-tests-remote-server-kubernetes.outputs.build_number }}) - $([ "${{ needs.smoke-tests-remote-server-kubernetes.outputs.build_status }}" == "success" ] && echo "✅ Success" || echo "❌ Failed")
+ - [Smoke Tests Kubernetes (Jobs Consolidation)](https://buildkite.com/skypilot-1/smoke-tests/builds/${{ needs.smoke-tests-kubernetes-jobs-consolidation.outputs.build_number }}) - $([ "${{ needs.smoke-tests-kubernetes-jobs-consolidation.outputs.build_status }}" == "success" ] && echo "✅ Success" || echo "❌ Failed")
- [Release Tests](https://buildkite.com/skypilot-1/release/builds/${{ needs.release-tests.outputs.build_number }}) - ⏳ (not waiting for completion)
*Release Tests may take up to 24 hours to complete and might fail due to resource constraints.*
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 501eca9c287..96f4dd9fb30 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -33,16 +33,16 @@ repos:
files: "^sky/skylet/providers/ibm/.*" # Only match IBM-specific directory
- repo: https://github.com/pre-commit/mirrors-mypy
- rev: v1.14.1 # Match the version from requirements
+ rev: v1.19.1 # Match the version from requirements
hooks:
- id: mypy
- args:
- # From tests/mypy_files.txt
+ args: # Match tests/mypy_files.txt - check sky and examples/admin_policy/example_policy
- "sky"
+ - "examples/admin_policy/example_policy"
+ - "--exclude"
+ - "sky/backends/monkey_patches"
- "--exclude"
- - "sky/benchmark|sky/callbacks|sky/backends/monkey_patches"
- - "--cache-dir"
- - "/dev/null"
+ - "examples/admin_policy/example_policy/build"
- "--check-untyped-defs"
pass_filenames: false
additional_dependencies:
@@ -96,7 +96,7 @@ repos:
- id: dashboard-format
name: dashboard format
- entry: bash -c 'cd sky/dashboard && npm run format'
+ entry: bash -c 'cd sky/dashboard && npm run format -- --log-level warn'
language: node
language_version: 24.12.0
files: ^sky/dashboard/
diff --git a/AGENTS.md b/AGENTS.md
index 43b52af0c3b..bf0e90bc959 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -105,7 +105,7 @@ From `requirements-dev.txt`:
- yapf==0.32.0
- pylint==2.14.5
- black==22.10.0
-- mypy==1.14.1
+- mypy==1.19.1
- isort==5.12.0
- pylint-quotes==0.2.3
@@ -315,6 +315,32 @@ sky api start
sky api status
```
+### Dashboard Development
+
+**For local API server development**, rebuild the dashboard before restarting:
+
+```bash
+# Install dependencies (first time or after package.json changes)
+npm --prefix sky/dashboard install
+
+# Rebuild the dashboard
+npm --prefix sky/dashboard run build
+
+# Then restart the API server
+sky api stop
+sky api start
+```
+
+**For a remote API server (Docker/Kubernetes)**, the Dockerfile builds the dashboard automatically; no manual build is needed before `docker build`.
+
+The dashboard is a Next.js application. For development with hot reloading:
+
+```bash
+# Run dashboard in development mode (separate from API server)
+cd sky/dashboard
+npm run dev
+```
+
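+To sanity-check that the restarted API server serves the rebuilt dashboard, a minimal probe (assuming the default local API server port 46580):
+
+```bash
+curl -sI http://localhost:46580/dashboard | head -n 1
+```
+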
### Mocking Remote API Server Locally
To test remote API server behavior locally:
@@ -345,7 +371,7 @@ helm dependency build ./charts/skypilot
DOCKER_IMAGE=my-repo/skypilot:v1
docker buildx build --push --platform linux/amd64 -t $DOCKER_IMAGE -f Dockerfile .
-# Deploy
+# Deploy (NEW installation)
NAMESPACE=skypilot
RELEASE_NAME=skypilot
helm upgrade --install $RELEASE_NAME ./charts/skypilot --devel \
@@ -354,6 +380,33 @@ helm upgrade --install $RELEASE_NAME ./charts/skypilot --devel \
--set apiService.image=$DOCKER_IMAGE
```
+#### Upgrading Existing Deployments
+
+**CRITICAL:** Always use `--reuse-values` to preserve database/credential config:
+
+```bash
+# Upgrade existing deployment (keeps PostgreSQL, auth, etc.)
+helm upgrade skypilot ./charts/skypilot -n skypilot --reuse-values \
+ --set apiService.image=$DOCKER_IMAGE
+
+# Check current values / rollback if needed
+helm get values skypilot -n skypilot
+helm rollback skypilot -n skypilot
+```
+
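+To watch an upgrade roll out, a sketch (the chart names the deployment `<release>-api-server`, so `skypilot-api-server` for the release above):
+
+```bash
+kubectl rollout status deployment/skypilot-api-server -n skypilot
+```
+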
+#### PostgreSQL Backend
+
+```bash
+# Create connection secret
+kubectl create secret generic db-uri -n skypilot \
+ --from-literal=uri="postgresql://user:pass@host:5432/db"
+
+# Deploy with PostgreSQL
+helm upgrade --install skypilot ./charts/skypilot -n skypilot \
+ --set apiService.dbConnectionSecretName=db-uri \
+ --set storage.enabled=false
+```
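+
+To confirm the server connected to the external database rather than local SQLite, one illustrative check (log wording may differ across versions):
+
+```bash
+kubectl logs deployment/skypilot-api-server -n skypilot | grep -i postgres
+```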
+
## Critical Code Paths (Handle with Care)
The following modules contain complex, stateful logic that requires careful review when modifying:
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index ede9764b958..ee67a6affae 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -21,7 +21,7 @@ Follow the steps below to set up a local development environment for contributin
#### Create a virtual environment
To avoid package conflicts, create and activate a clean virtual environment using [uv](https://docs.astral.sh/uv/):
```bash
-# SkyPilot requires python 3.8-3.11.
+# SkyPilot requires Python 3.9-3.11.
# --seed is required to ensure pip is installed (needed for building wheels)
uv venv --seed --python 3.11
source .venv/bin/activate
@@ -100,6 +100,53 @@ py-spy top -- python -m sky.cli status # Get a live top view
py-spy -h # For more options
```
+#### Testing WSL features on a Windows VM (Azure)
+
+To test features that require Windows Subsystem for Linux (WSL), such as the automatic Windows SSH config setup, you can create a Windows VM on Azure:
+
+```bash
+# Create resource group
+az group create --name wsl-test-vm --location eastus2
+
+# Create Windows 11 VM with WSL-compatible settings
+az vm create \
+ --resource-group wsl-test-vm \
+ --name win11-wsl-test \
+ --image MicrosoftWindowsDesktop:windows-11:win11-24h2-pro:latest \
+ --size Standard_D4s_v3 \
+ --admin-username skyuser \
+ --admin-password 'YourPassword123!' \
+ --public-ip-sku Standard
+
+# Enable WSL features on the VM
+az vm run-command invoke \
+ --resource-group wsl-test-vm \
+ --name win11-wsl-test \
+ --command-id RunPowerShellScript \
+ --scripts "
+ dism.exe /online /enable-feature /featurename:Microsoft-Windows-Subsystem-Linux /all /norestart
+ dism.exe /online /enable-feature /featurename:VirtualMachinePlatform /all /norestart
+ "
+
+# Restart VM to apply WSL features
+az vm restart --resource-group wsl-test-vm --name win11-wsl-test
+
+# Get VM public IP for RDP connection
+az vm show --resource-group wsl-test-vm --name win11-wsl-test --show-details --query publicIps -o tsv
+```
+
+Connect via RDP, then in PowerShell (as Admin):
+```powershell
+wsl --install -d Ubuntu-22.04
+```
+
+After restart, set up Ubuntu and install SkyPilot to test WSL-specific features.
+
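+A minimal in-WSL setup sketch (adjust the fork, branch, and extras to what you are testing):
+
+```bash
+sudo apt-get update && sudo apt-get install -y git python3-pip python3-venv
+git clone https://github.com/skypilot-org/skypilot.git
+cd skypilot
+pip install -e ".[aws]"
+```
+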
+**Cleanup:**
+```bash
+az group delete --name wsl-test-vm --yes --no-wait
+```
+
#### Testing in a container
It is often useful to test your changes in a clean environment set up from scratch. Using a container is a good way to do this.
diff --git a/Dockerfile b/Dockerfile
index 73c6645c28a..1eb285efa6d 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -70,6 +70,7 @@ ARG NEXT_BASE_PATH=/dashboard
# Install system packages
RUN apt-get update -y && \
+ apt-get upgrade -y && \
apt-get install --no-install-recommends -y \
git gcc rsync sudo patch openssh-server \
pciutils nano fuse socat netcat-openbsd curl tini autossh jq logrotate && \
diff --git a/README.md b/README.md
index 981703b4494..9db0abca330 100644
--- a/README.md
+++ b/README.md
@@ -50,6 +50,7 @@ SkyPilot gives **AI teams** a simple interface to run jobs on any infra.
:fire: *News* :fire:
- [Dec 2025] **SkyPilot v0.11** released: Multi-Cloud Pools, Fast Managed Jobs, Enterprise-Readiness at Large Scale, Programmability. [**Release notes**](https://github.com/skypilot-org/skypilot/releases/tag/v0.11.0)
- [Dec 2025] **SkyPilot Pools** released: Run batch inference and other jobs on a managed pool of warm workers (across clouds or clusters). [**blog**](https://blog.skypilot.co/skypilot-pools-deepseek-ocr/), [**docs**](https://docs.skypilot.co/en/latest/examples/pools.html)
+- [Dec 2025] Train **an agent to use Google Search** as a tool with RL on your Kubernetes or clouds: [**blog**](https://blog.skypilot.co/verl-tool-calling/), [**example**](./llm/verl/)
- [Nov 2025] Serve **Kimi K2 Thinking** with reasoning capabilities on your Kubernetes or clouds: [**example**](./llm/kimi-k2-thinking/)
- [Oct 2025] Run **RL training for LLMs** with SkyRL on your Kubernetes or clouds: [**example**](./llm/skyrl/)
- [Oct 2025] Train and serve [Andrej Karpathy's](https://x.com/karpathy/status/1977755427569111362) **nanochat** - the best ChatGPT that $100 can buy: [**example**](./llm/nanochat)
diff --git a/charts/external-metrics/.gitignore b/charts/external-metrics/.gitignore
deleted file mode 100644
index 2946e34f050..00000000000
--- a/charts/external-metrics/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-Chart.lock
-charts/
diff --git a/charts/external-metrics/Chart.yaml b/charts/external-metrics/Chart.yaml
deleted file mode 100644
index ed5106c27bd..00000000000
--- a/charts/external-metrics/Chart.yaml
+++ /dev/null
@@ -1,11 +0,0 @@
-apiVersion: v2
-name: skypilot-prometheus-server
-description: A Helm chart for deploying Prometheus Server
-type: application
-version: 0.0.0
-appVersion: "0.0"
-dependencies:
- - name: prometheus
- version: 27.20.0
- repository: https://prometheus-community.github.io/helm-charts
- condition: prometheus.enabled
diff --git a/charts/external-metrics/values.yaml b/charts/external-metrics/values.yaml
deleted file mode 100644
index 4835025d3ff..00000000000
--- a/charts/external-metrics/values.yaml
+++ /dev/null
@@ -1,28 +0,0 @@
-# Set configuration for Prometheus helm chart
-prometheus:
- enabled: true
- # Refer to https://github.com/prometheus-community/helm-charts/blob/main/charts/prometheus/values.yaml for available values.
- # Keep the installation minimal by default. If you want to monitor more resources other than the API server,
- # it is recommended to install and manage prometheus separately.
- # SkyPilot API server will be automatically discovered by the prometheus if it runs with the default kubernetes discovery configuration.
- server:
- persistentVolume:
- enabled: true
- size: 50Gi
- retention: "1000d"
- # The Prometheus documentations recommends setting the retention size to be 80-85% of the persistent volume size.
- # ref: https://prometheus.io/docs/prometheus/latest/storage/#right-sizing-retention-size
- # 43GB is roughly 80% of the 50Gi persistent volume size. We use Gi for the PV size and GB for the retention size
- # because these are the units specified by the Prometheus chart schema for each respective field.
- retentionSize: "43GB"
- kube-state-metrics:
- enabled: true
- # TODO (kyuds): remove skypilot-cluster label in v0.12.0; deprecated in favor of skypilot-cluster-name.
- metricLabelsAllowlist:
- - pods=[skypilot-cluster,skypilot-cluster-name]
- prometheus-node-exporter:
- enabled: false
- prometheus-pushgateway:
- enabled: false
- alertmanager:
- enabled: false
diff --git a/charts/skypilot/templates/NOTES.txt b/charts/skypilot/templates/NOTES.txt
index 4e6b4e8de54..69d5e64178f 100644
--- a/charts/skypilot/templates/NOTES.txt
+++ b/charts/skypilot/templates/NOTES.txt
@@ -3,3 +3,4 @@
{{- end }}
{{- include "skypilot.checkUpgradeConfig" . }}
{{- include "skypilot.validateOAuthConfig" . }}
+{{- include "skypilot.validateExternalProxyConfig" . }}
diff --git a/charts/skypilot/templates/_helpers.tpl b/charts/skypilot/templates/_helpers.tpl
index f4339c7cf71..55a0807fa4d 100644
--- a/charts/skypilot/templates/_helpers.tpl
+++ b/charts/skypilot/templates/_helpers.tpl
@@ -175,3 +175,18 @@ false
{{- fail "Error\nauth.oauth.enabled cannot be used together with ingress OAuth2 proxy authentication (ingress.oauth2-proxy.enabled). These authentication methods are mutually exclusive. Please:\n1. Disable auth.oauth.enabled, OR\n2. Set ingress.oauth2-proxy.enabled to false\nThen try again." -}}
{{- end -}}
{{- end -}}
+
+{{/* Validate the external proxy config */}}
+{{- define "skypilot.validateExternalProxyConfig" -}}
+{{- $externalProxyEnabled := .Values.auth.externalProxy.enabled -}}
+{{- $authOAuthEnabled := .Values.auth.oauth.enabled -}}
+{{- $ingressOAuthEnabled := include "skypilot.ingressOAuthEnabled" . | trim | eq "true" -}}
+
+{{- if and $externalProxyEnabled $authOAuthEnabled -}}
+ {{- fail "Error\nauth.externalProxy.enabled cannot be used together with auth.oauth.enabled. These authentication methods are mutually exclusive. Please:\n1. Disable auth.externalProxy.enabled, OR\n2. Set auth.oauth.enabled to false\nThen try again." -}}
+{{- end -}}
+
+{{- if and $externalProxyEnabled $ingressOAuthEnabled -}}
+ {{- fail "Error\nauth.externalProxy.enabled cannot be used together with ingress.oauth2-proxy.enabled. These authentication methods are mutually exclusive. Please:\n1. Disable auth.externalProxy.enabled, OR\n2. Set ingress.oauth2-proxy.enabled to false\nThen try again." -}}
+{{- end -}}
+{{- end -}}
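+
+{{/* Illustrative values that pass this validation (field names from values.schema.json):
+     auth:
+       externalProxy:
+         enabled: true
+         headerName: "X-Auth-Request-Email"
+         headerFormat: "plaintext"
+         jwtIdentityClaim: "sub"
+       oauth:
+         enabled: false */}}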
diff --git a/charts/skypilot/templates/api-deployment.yaml b/charts/skypilot/templates/api-deployment.yaml
index 8197ca29b6f..32987835615 100644
--- a/charts/skypilot/templates/api-deployment.yaml
+++ b/charts/skypilot/templates/api-deployment.yaml
@@ -11,8 +11,8 @@ spec:
{{- if and (not .Values.apiService.dbConnectionSecretName) (not .Values.apiService.dbConnectionString) }}
{{- fail "External database must be configured via .apiService.dbConnectionSecretName or .apiService.dbConnectionString when using RollingUpdate strategy" }}
{{- end }}
- {{- if .Values.storage.enabled }}
- {{- fail "Local storage is not supported when using RollingUpdate strategy. Use recreate upgrade strategy or set storage.enabled to false." }}
+ {{- if and .Values.storage.enabled (ne .Values.storage.accessMode "ReadWriteMany") }}
+ {{- fail "Local storage with ReadWriteOnce access mode is not supported when using RollingUpdate strategy. Either use Recreate upgrade strategy, set storage.enabled to false, or use ReadWriteMany access mode with a compatible storage class (e.g., NFS-backed storage like Google Filestore)." }}
{{- end }}
strategy:
type: RollingUpdate
@@ -82,7 +82,7 @@ spec:
value: {{ .Values.apiService.skypilotDev | quote }}
- name: SKYPILOT_RELEASE_NAME
value: {{ $fullName | quote }}
- {{- if include "skypilot.enableBasicAuthInAPIServer" . | trim | eq "true" }}
+ {{- if and (eq (include "skypilot.enableBasicAuthInAPIServer" . | trim) "true") (ne (include "skypilot.initialBasicAuthSecretName" . | trim) "") }}
- name: SKYPILOT_INITIAL_BASIC_AUTH
valueFrom:
secretKeyRef:
@@ -136,10 +136,18 @@ spec:
- name: SKYPILOT_ROLLING_UPDATE_ENABLED
value: "true"
{{- end }}
+ {{- if .Values.storage.enabled }}
+ - name: SKYPILOT_API_SERVER_STORAGE_ENABLED
+ value: "true"
+ {{- end }}
{{- if .Values.apiService.metrics.enabled }}
- name: SKY_API_SERVER_METRICS_ENABLED
value: "true"
{{- end }}
+ {{- if .Values.auth.disableBasicAuthMiddleware }}
+ - name: SKYPILOT_DISABLE_BASIC_AUTH_MIDDLEWARE
+ value: "true"
+ {{- end }}
{{- if .Values.auth.oauth.enabled }}
- name: SKYPILOT_AUTH_OAUTH2_PROXY_ENABLED
value: "true"
@@ -256,16 +264,42 @@ spec:
periodSeconds: 5
initialDelaySeconds: 5
volumeMounts:
+ {{- if and .Values.storage.enabled (eq .Values.apiService.upgradeStrategy "RollingUpdate") }}
+ # For RollingUpdate with storage enabled, use emptyDir for ~/.sky to avoid
+ # running SQLite on NFS. Only persist the clients directory for file mounts.
+ # An ephemeral volume is still required since we have to share ~/.sky between
+ # containers.
+ - name: sky-ephemeral
+ mountPath: /root/.sky
+ - name: state-volume
+ mountPath: /root/.sky/api_server/clients
+ subPath: api_server/clients
+ {{- else }}
- name: state-volume
mountPath: /root/.sky
subPath: .sky
+ {{- end }}
{{- if .Values.storage.enabled }}
- name: state-volume
mountPath: /root/.ssh # To preserve the SSH keys for the user when using the API server
subPath: .ssh
+ # Mount only the specific subdirectories needed for managed job logs, not the entire sky_logs folder
+ # This avoids persisting transient cluster logs (sky-*) and api_server logs
+ - name: state-volume
+ mountPath: /root/sky_logs/jobs_controller # Controller logs for `sky jobs logs --controller`
+ subPath: sky_logs/jobs_controller
+ - name: state-volume
+ mountPath: /root/sky_logs/managed_jobs # Task execution logs for managed jobs
+ subPath: sky_logs/managed_jobs
{{- end }}
- name: skypilot-config
mountPath: /var/skypilot/config
+ {{- if or .Values.auth.externalProxy.enabled (eq (include "skypilot.ingressOAuthEnabled" .) "true") }}
+ - name: skypilot-server-config
+ mountPath: /root/.sky/.server.yaml
+ subPath: server.yaml
+ readOnly: true
+ {{- end }}
{{- if .Values.apiService.sshNodePools }}
- name: skypilot-ssh-node-pools
mountPath: /var/skypilot/ssh_node_pool
@@ -374,9 +408,14 @@ spec:
sleep 60;
done
volumeMounts:
+ {{- if and .Values.storage.enabled (eq .Values.apiService.upgradeStrategy "RollingUpdate") }}
+ - name: sky-ephemeral
+ mountPath: /root/.sky
+ {{- else }}
- name: state-volume
mountPath: /root/.sky
subPath: .sky
+ {{- end }}
{{- end }}
{{- with .Values.apiService.sidecarContainers }}
{{- toYaml . | nindent 6 }}
@@ -660,6 +699,13 @@ spec:
- name: state-volume
persistentVolumeClaim:
claimName: {{ $fullName }}-state
+ {{- if eq .Values.apiService.upgradeStrategy "RollingUpdate" }}
+ # When using RollingUpdate with storage enabled, use a separate emptyDir
+ # for ~/.sky to avoid running SQLite on NFS. Only specific subdirectories
+ # like api_server/clients are persisted to the PVC.
+ - name: sky-ephemeral
+ emptyDir: {}
+ {{- end }}
{{- else }}
- name: state-volume
emptyDir: {}
@@ -719,6 +765,11 @@ spec:
- name: skypilot-config
configMap:
name: {{ $fullName }}-config
+ {{- if or .Values.auth.externalProxy.enabled (eq (include "skypilot.ingressOAuthEnabled" .) "true") }}
+ - name: skypilot-server-config
+ configMap:
+ name: {{ $fullName }}-server-config
+ {{- end }}
{{- if .Values.apiService.sshNodePools }}
- name: skypilot-ssh-node-pools
secret:
diff --git a/charts/skypilot/templates/server-config.yaml b/charts/skypilot/templates/server-config.yaml
new file mode 100644
index 00000000000..ab0e981ad96
--- /dev/null
+++ b/charts/skypilot/templates/server-config.yaml
@@ -0,0 +1,26 @@
+{{- $externalProxyEnabled := .Values.auth.externalProxy.enabled -}}
+{{- $ingressOAuthEnabled := include "skypilot.ingressOAuthEnabled" . | trim | eq "true" -}}
+{{- if or $externalProxyEnabled $ingressOAuthEnabled -}}
+{{- $fullName := include "skypilot.fullname" . -}}
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: {{ $fullName }}-server-config
+ namespace: {{ .Release.Namespace }}
+data:
+ server.yaml: |-
+ auth:
+ external_proxy:
+ {{- if $externalProxyEnabled }}
+ enabled: true
+ header_name: {{ .Values.auth.externalProxy.headerName | quote }}
+ header_format: {{ .Values.auth.externalProxy.headerFormat | quote }}
+ jwt_identity_claim: {{ .Values.auth.externalProxy.jwtIdentityClaim | quote }}
+ {{- else }}
+ # Enabled for ingress.oauth2-proxy compatibility
+ enabled: true
+ header_name: "X-Auth-Request-Email"
+ header_format: "plaintext"
+ jwt_identity_claim: "sub"
+ {{- end }}
+{{- end }}
diff --git a/charts/skypilot/tests/deployment_test.yaml b/charts/skypilot/tests/deployment_test.yaml
index 4e1ab79a26e..30f8e8c609a 100644
--- a/charts/skypilot/tests/deployment_test.yaml
+++ b/charts/skypilot/tests/deployment_test.yaml
@@ -148,6 +148,25 @@ tests:
valueFrom:
fieldRef:
fieldPath: metadata.uid
+ # Verify sky-ephemeral is NOT added when storage.enabled=false (backward compatibility)
+ - notContains:
+ path: spec.template.spec.volumes
+ content:
+ name: sky-ephemeral
+ emptyDir: {}
+ # Verify state-volume uses emptyDir (not PVC)
+ - contains:
+ path: spec.template.spec.volumes
+ content:
+ name: state-volume
+ emptyDir: {}
+ # Verify ~/.sky is mounted from state-volume with subPath
+ - contains:
+ path: spec.template.spec.containers[0].volumeMounts
+ content:
+ name: state-volume
+ mountPath: /root/.sky
+ subPath: .sky
- it: should use RollingUpdate strategy when configured with external database via connection string
set:
@@ -179,14 +198,162 @@ tests:
- failedTemplate:
errorMessage: "External database must be configured via .apiService.dbConnectionSecretName or .apiService.dbConnectionString when using RollingUpdate strategy"
- - it: should fail RollingUpdate strategy with local storage enabled
+ - it: should fail RollingUpdate strategy with local storage enabled using ReadWriteOnce
set:
apiService.upgradeStrategy: RollingUpdate
apiService.dbConnectionSecretName: test-db-secret
storage.enabled: true
asserts:
- failedTemplate:
- errorMessage: "Local storage is not supported when using RollingUpdate strategy. Use recreate upgrade strategy or set storage.enabled to false."
+ errorMessage: "Local storage with ReadWriteOnce access mode is not supported when using RollingUpdate strategy. Either use Recreate upgrade strategy, set storage.enabled to false, or use ReadWriteMany access mode with a compatible storage class (e.g., NFS-backed storage like Google Filestore)."
+
+ - it: should allow RollingUpdate strategy with local storage enabled using ReadWriteMany
+ set:
+ apiService.upgradeStrategy: RollingUpdate
+ apiService.dbConnectionSecretName: test-db-secret
+ storage.enabled: true
+ storage.accessMode: ReadWriteMany
+ asserts:
+ - equal:
+ path: spec.strategy.type
+ value: RollingUpdate
+ - equal:
+ path: spec.strategy.rollingUpdate.maxSurge
+ value: 1
+ - equal:
+ path: spec.strategy.rollingUpdate.maxUnavailable
+ value: 0
+
+ - it: should use emptyDir for ~/.sky when RollingUpdate with storage enabled to avoid SQLite on NFS
+ set:
+ apiService.upgradeStrategy: RollingUpdate
+ apiService.dbConnectionSecretName: test-db-secret
+ storage.enabled: true
+ storage.accessMode: ReadWriteMany
+ asserts:
+ # Should have sky-ephemeral emptyDir volume
+ - contains:
+ path: spec.template.spec.volumes
+ content:
+ name: sky-ephemeral
+ emptyDir: {}
+ # Should mount sky-ephemeral at ~/.sky
+ - contains:
+ path: spec.template.spec.containers[0].volumeMounts
+ content:
+ name: sky-ephemeral
+ mountPath: /root/.sky
+ # Should NOT mount state-volume at ~/.sky with subPath .sky
+ - notContains:
+ path: spec.template.spec.containers[0].volumeMounts
+ content:
+ name: state-volume
+ mountPath: /root/.sky
+ subPath: .sky
+
+ - it: should persist api_server/clients directory when RollingUpdate with storage enabled
+ set:
+ apiService.upgradeStrategy: RollingUpdate
+ apiService.dbConnectionSecretName: test-db-secret
+ storage.enabled: true
+ storage.accessMode: ReadWriteMany
+ asserts:
+ # Should mount api_server/clients from PVC
+ - contains:
+ path: spec.template.spec.containers[0].volumeMounts
+ content:
+ name: state-volume
+ mountPath: /root/.sky/api_server/clients
+ subPath: api_server/clients
+
+ - it: should use state-volume for ~/.sky when Recreate with storage enabled
+ set:
+ apiService.upgradeStrategy: Recreate
+ storage.enabled: true
+ asserts:
+ # Should NOT have sky-ephemeral volume
+ - notContains:
+ path: spec.template.spec.volumes
+ content:
+ name: sky-ephemeral
+ emptyDir: {}
+ # Should mount state-volume at ~/.sky with subPath
+ - contains:
+ path: spec.template.spec.containers[0].volumeMounts
+ content:
+ name: state-volume
+ mountPath: /root/.sky
+ subPath: .sky
+ # Should NOT have separate clients mount
+ - notContains:
+ path: spec.template.spec.containers[0].volumeMounts
+ content:
+ name: state-volume
+ mountPath: /root/.sky/api_server/clients
+ subPath: api_server/clients
+
+ - it: should use sky-ephemeral for logrotate sidecar when RollingUpdate with storage enabled
+ set:
+ apiService.upgradeStrategy: RollingUpdate
+ apiService.dbConnectionSecretName: test-db-secret
+ storage.enabled: true
+ storage.accessMode: ReadWriteMany
+ apiService.logs.retention.enabled: true
+ asserts:
+ # Logrotate sidecar should mount sky-ephemeral at ~/.sky
+ - contains:
+ path: spec.template.spec.containers[1].volumeMounts
+ content:
+ name: sky-ephemeral
+ mountPath: /root/.sky
+
+ - it: should use state-volume for logrotate sidecar when Recreate with storage enabled
+ set:
+ apiService.upgradeStrategy: Recreate
+ storage.enabled: true
+ apiService.logs.retention.enabled: true
+ asserts:
+ # Logrotate sidecar should mount state-volume at ~/.sky with subPath
+ - contains:
+ path: spec.template.spec.containers[1].volumeMounts
+ content:
+ name: state-volume
+ mountPath: /root/.sky
+ subPath: .sky
+
+ - it: should mount managed job log directories when storage is enabled
+ set:
+ storage.enabled: true
+ asserts:
+ - contains:
+ path: spec.template.spec.containers[0].volumeMounts
+ content:
+ name: state-volume
+ mountPath: /root/sky_logs/jobs_controller
+ subPath: sky_logs/jobs_controller
+ - contains:
+ path: spec.template.spec.containers[0].volumeMounts
+ content:
+ name: state-volume
+ mountPath: /root/sky_logs/managed_jobs
+ subPath: sky_logs/managed_jobs
+
+ - it: should not mount managed job log directories when storage is disabled
+ set:
+ storage.enabled: false
+ asserts:
+ - notContains:
+ path: spec.template.spec.containers[0].volumeMounts
+ content:
+ name: state-volume
+ mountPath: /root/sky_logs/jobs_controller
+ subPath: sky_logs/jobs_controller
+ - notContains:
+ path: spec.template.spec.containers[0].volumeMounts
+ content:
+ name: state-volume
+ mountPath: /root/sky_logs/managed_jobs
+ subPath: sky_logs/managed_jobs
- it: should honor fullnameOverride for deployment names and labels
set:
@@ -606,3 +773,90 @@ tests:
content:
name: setup-coreweave-credentials
image: registry.example.com/custom/berkeleyskypilot/skypilot-nightly:latest
+
+ # Test cases for SKYPILOT_INITIAL_BASIC_AUTH environment variable
+ - it: should set SKYPILOT_INITIAL_BASIC_AUTH when basic auth enabled with initialBasicAuthSecret
+ set:
+ apiService.enableUserManagement: true
+ apiService.initialBasicAuthSecret: my-auth-secret
+ asserts:
+ - contains:
+ path: spec.template.spec.containers[0].env
+ content:
+ name: SKYPILOT_INITIAL_BASIC_AUTH
+ valueFrom:
+ secretKeyRef:
+ name: my-auth-secret
+ key: auth
+
+ - it: should set SKYPILOT_INITIAL_BASIC_AUTH when basic auth enabled with initialBasicAuthCredentials
+ set:
+ apiService.enableUserManagement: true
+ apiService.initialBasicAuthCredentials: "user:password"
+ asserts:
+ - contains:
+ path: spec.template.spec.containers[0].env
+ content:
+ name: SKYPILOT_INITIAL_BASIC_AUTH
+ valueFrom:
+ secretKeyRef:
+ name: RELEASE-NAME-initial-basic-auth
+ key: auth
+
+ - it: should not set SKYPILOT_INITIAL_BASIC_AUTH when basic auth enabled but no initial credentials
+ set:
+ apiService.enableUserManagement: true
+ # Neither initialBasicAuthSecret nor initialBasicAuthCredentials set
+ asserts:
+ - notContains:
+ path: spec.template.spec.containers[0].env
+ content:
+ name: SKYPILOT_INITIAL_BASIC_AUTH
+
+ - it: should not set SKYPILOT_INITIAL_BASIC_AUTH when basic auth disabled
+ set:
+ apiService.enableUserManagement: false
+ apiService.initialBasicAuthSecret: my-auth-secret
+ asserts:
+ - notContains:
+ path: spec.template.spec.containers[0].env
+ content:
+ name: SKYPILOT_INITIAL_BASIC_AUTH
+
+ - it: should not set SKYPILOT_INITIAL_BASIC_AUTH when oauth2-proxy is enabled
+ set:
+ ingress.oauth2-proxy.enabled: true
+ apiService.enableUserManagement: true
+ apiService.initialBasicAuthSecret: my-auth-secret
+ asserts:
+ - notContains:
+ path: spec.template.spec.containers[0].env
+ content:
+ name: SKYPILOT_INITIAL_BASIC_AUTH
+
+ # Test cases for SKYPILOT_DISABLE_BASIC_AUTH_MIDDLEWARE environment variable
+ - it: should set SKYPILOT_DISABLE_BASIC_AUTH_MIDDLEWARE when auth.disableBasicAuthMiddleware is true
+ set:
+ auth.disableBasicAuthMiddleware: true
+ asserts:
+ - contains:
+ path: spec.template.spec.containers[0].env
+ content:
+ name: SKYPILOT_DISABLE_BASIC_AUTH_MIDDLEWARE
+ value: "true"
+
+ - it: should not set SKYPILOT_DISABLE_BASIC_AUTH_MIDDLEWARE when auth.disableBasicAuthMiddleware is false
+ set:
+ auth.disableBasicAuthMiddleware: false
+ asserts:
+ - notContains:
+ path: spec.template.spec.containers[0].env
+ content:
+ name: SKYPILOT_DISABLE_BASIC_AUTH_MIDDLEWARE
+
+ - it: should not set SKYPILOT_DISABLE_BASIC_AUTH_MIDDLEWARE by default
+ asserts:
+ - notContains:
+ path: spec.template.spec.containers[0].env
+ content:
+ name: SKYPILOT_DISABLE_BASIC_AUTH_MIDDLEWARE
diff --git a/charts/skypilot/values.schema.json b/charts/skypilot/values.schema.json
index 6ae7a2685f4..2d7e9fbaffb 100644
--- a/charts/skypilot/values.schema.json
+++ b/charts/skypilot/values.schema.json
@@ -172,6 +172,26 @@
"null"
],
"properties": {
+ "externalProxy": {
+ "type": [
+ "object",
+ "null"
+ ],
+ "properties": {
+ "enabled": {
+ "type": "boolean"
+ },
+ "headerFormat": {
+ "type": "string"
+ },
+ "headerName": {
+ "type": "string"
+ },
+ "jwtIdentityClaim": {
+ "type": "string"
+ }
+ }
+ },
"oauth": {
"type": [
"object",
diff --git a/charts/skypilot/values.yaml b/charts/skypilot/values.yaml
index 2cf87dbf1f4..9d4a4a2b980 100644
--- a/charts/skypilot/values.yaml
+++ b/charts/skypilot/values.yaml
@@ -36,6 +36,9 @@ apiService:
# - Recreate: delete the old pod first and create a new one (has downtime).
# - RollingUpdate: [EXPERIMENTAL] create a new pod first, wait for it to be ready, then delete the old one (zero downtime).
# Default to Recreate. When set to RollingUpdate, an external database must be configured via .apiService.dbConnectionSecretName or .apiService.dbConnectionString.
+ # For persistent storage with RollingUpdate, use storage.accessMode=ReadWriteMany with an RWX-capable storage class.
+ # If storage.enabled=false with RollingUpdate, file mounts and logs will be lost on pod restart; consider configuring
+ # 'jobs.bucket' in the SkyPilot config to persist file mounts to cloud storage.
upgradeStrategy: Recreate
# Deprecated: use other auth methods instead.
# Refer to https://docs.skypilot.co/en/latest/reference/auth.html for more details.
@@ -257,14 +260,53 @@ auth:
# @schema type: [boolean, null]
enabled: null
+ # Proxy authentication configuration.
+ # Use this when deploying behind an external authentication proxy
+ # (e.g., AWS ALB with Cognito, Azure Front Door, custom ingress auth).
+ # When enabled, the API server trusts the identity header from the proxy.
+ # This is mutually exclusive with auth.oauth and ingress.oauth2-proxy.
+ # @schema type: [object, null]
+ externalProxy:
+ # Enable proxy authentication.
+ # @schema type: [boolean]
+ enabled: false
+ # Header name containing the user identity.
+ # @schema type: [string]
+ headerName: 'X-Auth-Request-Email'
+ # Header format: 'plaintext' or 'jwt'.
+ # Use 'jwt' for headers that contain JWT tokens.
+ # Use 'plaintext' for headers that contain plain identity strings.
+ # @schema type: [string]
+ headerFormat: 'plaintext'
+ # JWT claim to extract identity from (only used when headerFormat is 'jwt').
+ # @schema type: [string]
+ jwtIdentityClaim: 'sub'
+
storage:
# Enable/disable persistent storage
# With this enabled, SkyPilot will use a PV to persist the internal data like states, logs, lock files, catalog, etc.
+ # Persisted data includes:
+  #   - Managed job logs (accessible via `sky jobs logs <job_id>` and `sky jobs logs --controller <job_id>`)
+ # - File mounts uploaded during managed job submission
+ # - API server state and configuration
+ # Note: Transient cluster logs (sky-*) and api_server logs are NOT persisted to minimize storage usage.
# Refer to https://docs.skypilot.co/en/latest/reference/architecture/state.html for more details.
+ #
+ # IMPORTANT: When using RollingUpdate upgrade strategy:
+ # - ReadWriteOnce (RWO): NOT supported - the PVC cannot be mounted by both old and new pods during rolling update.
+ # - ReadWriteMany (RWX): Supported - requires an RWX-capable storage class (e.g., NFS-backed storage like Google Filestore,
+ # AWS EFS, Azure Files, or an NFS provisioner). Both pods can mount the same PVC during the rolling update.
+ # - storage.enabled=false: Supported - but file mounts and logs will be lost on pod restart. Consider configuring
+ # 'jobs.bucket' in the SkyPilot config to use cloud storage for file mounts.
enabled: true
# Storage class name - leave empty to use cluster default
+ # For RWX storage with RollingUpdate, use a storage class that supports ReadWriteMany access mode:
+ # - GKE: Create a Filestore-backed storage class (https://cloud.google.com/filestore/docs/accessing-fileshares)
+ # - EKS: Use EFS CSI driver (https://docs.aws.amazon.com/eks/latest/userguide/efs-csi.html)
+ # - AKS: Use Azure Files (https://docs.microsoft.com/azure/aks/azure-files-dynamic-pv)
storageClassName: ""
# Access modes - ReadWriteOnce or ReadWriteMany depending on what is supported by the storage class
+ # When using RollingUpdate upgrade strategy, ReadWriteMany is required for persistent storage.
accessMode: ReadWriteOnce
# Storage size
size: 10Gi
diff --git a/docs/source/_static/custom.js b/docs/source/_static/custom.js
index 89d53b38256..9f7c75d3d64 100644
--- a/docs/source/_static/custom.js
+++ b/docs/source/_static/custom.js
@@ -35,16 +35,14 @@ document.addEventListener('DOMContentLoaded', () => {
// New items:
const newItems = [
{ selector: '.toctree-l2 > a', text: 'HTTPS Encryption' },
- { selector: '.toctree-l1 > a', text: 'Examples' },
- { selector: '.toctree-l1 > a', text: 'Workspaces: Isolating Teams' },
{ selector: '.toctree-l1 > a', text: 'External Logging Storage' },
- { selector: '.toctree-l1 > a', text: 'Authentication and RBAC' },
{ selector: '.toctree-l1 > a', text: 'Volumes' },
{ selector: '.toctree-l2 > a', text: 'Upgrading API Server' },
{ selector: '.toctree-l1 > a', text: 'High Availability Controller' },
{ selector: '.toctree-l2 > a', text: 'High Availability Controller' },
{ selector: '.toctree-l3 > a', text: 'Advanced: High Availability Controller' },
{ selector: '.toctree-l1 > a', text: 'Using a Pool of Workers' },
+ { selector: '.toctree-l1 > a', text: 'Job Groups' },
{ selector: '.toctree-l1 > a', text: 'Using Slurm' },
];
newItems.forEach(({ selector, text }) => {
diff --git a/docs/source/cloud-setup/cloud-permissions/aws.rst b/docs/source/cloud-setup/cloud-permissions/aws.rst
index 9f268b41e8f..3b5d3071774 100644
--- a/docs/source/cloud-setup/cloud-permissions/aws.rst
+++ b/docs/source/cloud-setup/cloud-permissions/aws.rst
@@ -463,11 +463,44 @@ These are the minimal policy rules required by SkyPilot:
{
"Effect": "Allow",
"Action": [
- "s3:*"
+ "s3:GetObject",
+ "s3:PutObject",
+ "s3:DeleteObject"
],
+ "Resource": "arn:aws:s3:::*/*"
+ },
+ {
+ "Effect": "Allow",
+ "Action": [
+ "s3:ListBucket",
+ "s3:GetBucketLocation"
+ ],
+ "Resource": "arn:aws:s3:::*"
+ },
+ {
+ "Effect": "Allow",
+ "Action": "s3:ListAllMyBuckets",
"Resource": "*"
}
+**Optional**: If you also want to allow SkyPilot to create and delete S3 buckets (for ``sky storage`` commands), add these additional permissions:
+
+.. code-block:: json
+
+ {
+ "Effect": "Allow",
+ "Action": [
+ "s3:CreateBucket",
+ "s3:DeleteBucket",
+ "s3:PutBucketTagging"
+ ],
+ "Resource": "arn:aws:s3:::*"
+ }
+
+.. tip::
+
+ If you are using EKS and want to set up S3 access with IAM roles, see :ref:`aws-eks-iam-roles`.
+
**Once you have added all needed policies, click Next** and follow the instructions to finish creating the policy. You can give the policy a descriptive name, such as ``minimal-skypilot-policy``.
diff --git a/docs/source/cloud-setup/cloud-permissions/kubernetes.rst b/docs/source/cloud-setup/cloud-permissions/kubernetes.rst
index c58bd28a5d2..31fa303d830 100644
--- a/docs/source/cloud-setup/cloud-permissions/kubernetes.rst
+++ b/docs/source/cloud-setup/cloud-permissions/kubernetes.rst
@@ -266,7 +266,6 @@ To create a service account that has all necessary permissions for SkyPilot (inc
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: sky-sa-cluster-role # Can be changed if needed
- namespace: default # Change to your namespace if using a different one.
labels:
parent: skypilot
rules:
@@ -291,7 +290,6 @@ To create a service account that has all necessary permissions for SkyPilot (inc
kind: ClusterRoleBinding
metadata:
name: sky-sa-cluster-role-binding # Can be changed if needed
- namespace: default # Change to your namespace if using a different one.
labels:
parent: skypilot
subjects:
@@ -300,7 +298,7 @@ To create a service account that has all necessary permissions for SkyPilot (inc
namespace: default # Change to your namespace if using a different one.
roleRef:
kind: ClusterRole
- name: sky-sa-cluster-role # Use the same name as the cluster role at line 43
+ name: sky-sa-cluster-role # Use the same name as the cluster role at line 56
apiGroup: rbac.authorization.k8s.io
---
# Optional: If using object store mounting, create the skypilot-system namespace
diff --git a/docs/source/docs/index.rst b/docs/source/docs/index.rst
index 63cac22835a..f0fe1a8d7d4 100644
--- a/docs/source/docs/index.rst
+++ b/docs/source/docs/index.rst
@@ -306,6 +306,7 @@ Read the research:
Many Parallel Jobs <../running-jobs/many-jobs>
Model Training Guide <../reference/training-guide>
Using a Pool of Workers <../examples/pools>
+ Job Groups <../examples/job-groups>
.. toctree::
:hidden:
diff --git a/docs/source/examples/interactive-development.rst b/docs/source/examples/interactive-development.rst
index 316d601f71f..1015c553816 100644
--- a/docs/source/examples/interactive-development.rst
+++ b/docs/source/examples/interactive-development.rst
@@ -115,6 +115,13 @@ For more details, please refer to the `VSCode documentation `
diff --git a/docs/source/examples/job-groups.rst b/docs/source/examples/job-groups.rst
new file mode 100644
--- /dev/null
+++ b/docs/source/examples/job-groups.rst
+.. _job-groups:
+
+Job Groups
+==========
+
+Unlike :ref:`managed jobs <managed-jobs>` which run tasks sequentially (pipelines),
+Job Groups launch all tasks simultaneously, enabling complex distributed architectures.
+
+.. figure:: ../images/job-groups-dashboard.png
+ :width: 100%
+ :align: center
+ :alt: Job Groups in SkyPilot Dashboard
+
+ A Job Group with 4 tasks (data-server, rollout-server, reward-server, ppo-trainer)
+ running in parallel on Kubernetes. Each task has different resource requirements
+ and can be monitored independently through the dashboard.
+
+Overview
+--------
+
+**Key Features:**
+
+- **Parallel execution**: Launch multiple tasks simultaneously, each running independently
+- **Heterogeneous resources**: Different resource requirements per task (e.g., GPUs for training, CPUs for data serving)
+- **Automatic service discovery**: Tasks discover each other and communicate via hostnames
+- **Independent recovery**: Each task recovers from preemptions without affecting other tasks
+
+**When to Use Job Groups:**
+
+Job Groups are ideal for workloads where multiple components with different requirements need to run together and communicate. Common use cases include:
+
+- **RL post-training**: Separate tasks for trainer, reward modeling, rollout server, and data serving
+- **Parallel train-eval**: Training and evaluation running in parallel with shared storage
+
+.. tip::
+
+ Use Job Groups when your workload has **heterogeneous tasks** that need to run
+ **in parallel** and **communicate with each other**. For homogeneous multi-node
+   training within a single task, use :ref:`distributed jobs <dist-jobs>` instead.
+ For sequential task execution, use :ref:`managed job pipelines `.
+
+.. contents:: Contents
+ :local:
+ :backlinks: none
+
+
+Creating a job group
+--------------------
+
+A Job Group is defined using a multi-document YAML file. The first document is the
+**header** that defines the group's properties, followed by individual task definitions:
+
+.. code-block:: yaml
+
+ # job-group.yaml
+ ---
+ # Header: Job Group configuration
+ name: my-job-group
+ execution: parallel # Required: indicates this is a Job Group
+ ---
+ # Task 1: Trainer
+ name: trainer
+ resources:
+ accelerators: A100:1
+ run: |
+ python train.py
+ ---
+ # Task 2: Evaluator
+ name: evaluator
+ resources:
+ accelerators: A100:1
+ run: |
+ python evaluate.py
+
+Launch the Job Group with:
+
+.. code-block:: console
+
+ $ sky jobs launch job-group.yaml
+
+Header fields
+~~~~~~~~~~~~~
+
+The header document supports the following fields:
+
+.. list-table::
+ :widths: 20 20 60
+ :header-rows: 1
+
+ * - Field
+ - Default
+ - Description
+ * - ``name``
+ - Required
+ - Name of the Job Group
+ * - ``execution``
+ - Required
+ - Must be ``parallel`` to indicate this is a Job Group
+ * - ``primary_tasks``
+ - None
+ - List of task names that are "primary". Tasks not in this list are
+ "auxiliary" - long-running services (e.g., data servers, replay buffers)
+ that wait for a signal to terminate. When all primary tasks complete,
+ auxiliary tasks are terminated. If not set, all tasks are primary.
+ * - ``termination_delay``
+ - None
+ - Delay before terminating auxiliary tasks when primary tasks complete,
+ allowing them to finish pending work (e.g., flushing data). Can be a
+ string (e.g., ``"30s"``, ``"5m"``) or a dict with per-task delays
+ (e.g., ``{"default": "30s", "replay-buffer": "1m"}``).
+
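+For instance, a header combining ``primary_tasks`` with the dict form of
+``termination_delay`` (a sketch based on the field descriptions above; the task
+names are illustrative) could look like:
+
+.. code-block:: yaml
+
+    ---
+    name: my-job-group
+    execution: parallel
+    primary_tasks: [trainer]
+    termination_delay:
+      default: 30s
+      replay-buffer: 1m
+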
+Each task document after the header follows the standard :ref:`SkyPilot task YAML format <yaml-spec>`.
+
+.. note::
+
+ Every task in a Job Group **must have a unique name**. The name is used for
+ service discovery and log viewing.
+
+
+Service discovery
+-----------------
+
+Tasks in a Job Group can discover each other using hostnames. SkyPilot automatically
+configures networking so that tasks can communicate.
+
+Hostname format
+~~~~~~~~~~~~~~~
+
+Each task's head node is accessible via the hostname:
+
+.. code-block:: text
+
+ {task_name}-0.{job_group_name}
+
+For multi-node tasks, worker nodes use:
+
+.. code-block:: text
+
+ {task_name}-{node_index}.{job_group_name}
+
+For example, in a Job Group named ``rlhf-experiment`` with a 2-node ``trainer`` task:
+
+- ``trainer-0.rlhf-experiment`` - Head node (rank 0)
+- ``trainer-1.rlhf-experiment`` - Worker node (rank 1)
+
+Environment variables
+~~~~~~~~~~~~~~~~~~~~~
+
+SkyPilot injects the following environment variables into all tasks:
+
+.. list-table::
+ :widths: 40 60
+ :header-rows: 1
+
+ * - Variable
+ - Description
+ * - ``SKYPILOT_JOBGROUP_NAME``
+ - Name of the Job Group
+
+Example usage in a task:
+
+.. code-block:: bash
+
+ # Access the trainer task from the evaluator using the hostname
+ curl http://trainer-0.${SKYPILOT_JOBGROUP_NAME}:8000/status
+
+
+Viewing logs
+------------
+
+View logs for a specific task within a Job Group:
+
+.. code-block:: console
+
+ # View logs for a specific task by name
+ $ sky jobs logs trainer
+
+ # View logs for a specific task by task ID
+ $ sky jobs logs 0
+
+ # View all task logs (default)
+ $ sky jobs logs
+
+When viewing logs for a multi-task job, SkyPilot displays a hint:
+
+.. code-block:: console
+
+ Hint: This job has 3 tasks. Use 'sky jobs logs 42 TASK' to view logs
+ for a specific task (TASK can be task ID or name).
+
+
+Examples
+--------
+
+Parallel train-eval with shared storage
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This example runs training and evaluation in parallel, sharing checkpoints via
+a Kubernetes PVC volume:
+
+.. figure:: ../images/job-groups-train-eval-architecture.png
+ :width: 80%
+ :align: center
+ :alt: Parallel Train-Eval Architecture with Job Groups
+
+ Parallel training and evaluation with shared storage. The trainer saves checkpoints
+ to a shared volume while the evaluator monitors and evaluates new checkpoints on-the-fly.
+
+.. code-block:: yaml
+
+ ---
+ name: train-eval
+ execution: parallel
+ ---
+ name: trainer
+ resources:
+ accelerators: A100:1
+ volumes:
+ /checkpoints: my-checkpoint-volume
+ run: |
+ python train.py --checkpoint-dir /checkpoints
+ ---
+ name: evaluator
+ resources:
+ accelerators: A100:1
+ volumes:
+ /checkpoints: my-checkpoint-volume
+ run: |
+ python evaluate.py --checkpoint-dir /checkpoints
+
+See the full example at ``llm/train-eval-jobgroup/`` in the SkyPilot repository.
+
+RL post-training architecture
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This example demonstrates a distributed RL post-training architecture with 5 tasks:
+
+.. code-block:: yaml
+
+ ---
+ name: rlhf-training
+ execution: parallel
+ ---
+ name: data-server
+ resources:
+ cpus: 4+
+ run: |
+ python data_server.py
+ ---
+ name: rollout-server
+ num_nodes: 2
+ resources:
+ accelerators: A100:1
+ run: |
+ python rollout_server.py
+ ---
+ name: reward-server
+ resources:
+ cpus: 8+
+ run: |
+ python reward_server.py
+ ---
+ name: replay-buffer
+ resources:
+ cpus: 4+
+ memory: 32+
+ run: |
+ python replay_buffer.py
+ ---
+ name: ppo-trainer
+ num_nodes: 2
+ resources:
+ accelerators: A100:1
+ run: |
+ python ppo_trainer.py \
+ --data-server data-server-0.${SKYPILOT_JOBGROUP_NAME}:8000 \
+ --rollout-server rollout-server-0.${SKYPILOT_JOBGROUP_NAME}:8001 \
+ --reward-server reward-server-0.${SKYPILOT_JOBGROUP_NAME}:8002
+
+See the full RL post-training example at ``llm/rl-post-training-jobgroup/`` in the SkyPilot repository.
+
+Primary and auxiliary tasks
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In many distributed workloads, you have a main task (e.g., trainer) and supporting
+services (e.g., data servers, replay buffers) that run indefinitely until the main
+task signals completion. These supporting services are "auxiliary tasks" - they
+don't have a natural termination point and need to be told when to shut down.
+
+Use ``primary_tasks`` to designate which tasks drive the job's lifecycle. Auxiliary
+tasks (those not listed) will be automatically terminated when all primary tasks
+complete:
+
+.. code-block:: yaml
+
+ ---
+ name: train-with-services
+ execution: parallel
+ primary_tasks: [trainer] # Only trainer is primary
+ termination_delay: 30s # Give services 30s to finish after trainer completes
+ ---
+ name: trainer
+ resources:
+ accelerators: A100:1
+ run: |
+ python train.py # Primary task: job completes when this finishes
+ ---
+ name: data-server
+ resources:
+ cpus: 4+
+ run: |
+ python data_server.py # Auxiliary: terminated 30s after trainer completes
+
+When the trainer task finishes, the data-server (auxiliary) task will receive a
+termination signal after the 30-second delay, allowing it to flush pending data
+or perform cleanup.
+
+
+Current limitations
+-------------------
+
+- **Co-location**: All tasks in a Job Group run on the same infrastructure
+ (same Kubernetes cluster or cloud zone).
+
+- **Networking**: Service discovery (hostname-based communication between tasks)
+ currently only works on Kubernetes. On other clouds, tasks can run in parallel
+ but cannot communicate with each other using the hostname format.
+
+.. note::
+
+ Job Groups require ``execution: parallel`` in the header. For sequential task
+ execution, use :ref:`managed job pipelines ` instead (omit the
+ ``execution`` field or set it to ``serial``).
+
+
+.. seealso::
+
+ :ref:`managed-jobs` for single tasks or sequential pipelines.
+
+ :ref:`dist-jobs` for multi-node distributed training within a single task.
diff --git a/docs/source/examples/managed-jobs.rst b/docs/source/examples/managed-jobs.rst
index 61e3d71267e..82e70db2b85 100644
--- a/docs/source/examples/managed-jobs.rst
+++ b/docs/source/examples/managed-jobs.rst
@@ -7,6 +7,13 @@ Managed Jobs
This feature is great for scaling out: running a single job for long durations, or running many jobs in parallel.
+.. seealso::
+
+   :doc:`pools` for running batch inference workloads across multiple infrastructures.
+
+ :ref:`job-groups` for running multiple heterogeneous tasks in parallel that
+ can communicate with each other.
+
SkyPilot supports **managed jobs** (:code:`sky jobs`), which can automatically retry failures, recover from spot instance preemptions, and clean up when done.
To start a managed job, use :code:`sky jobs launch`:
@@ -403,8 +410,12 @@ A pipeline is a managed job that contains a sequence of tasks running one after
This is useful for running a sequence of tasks that depend on each other, e.g., training a model and then running inference on it.
Different tasks can have different resource requirements to use appropriate per-task resources, which saves costs, while keeping the burden of managing the tasks off the user.
+.. seealso::
+
+ :ref:`job-groups` for running multiple tasks **in parallel** instead of sequentially.
+
.. note::
- In other words, a managed job is either a single task or a pipeline of tasks. All managed jobs are submitted by :code:`sky jobs launch`.
+    In other words, a managed job is either a single task, a pipeline (sequential tasks), or a :ref:`job group <job-groups>` (parallel tasks). All managed jobs are submitted by :code:`sky jobs launch`.
To run a pipeline, specify the sequence of tasks in a YAML file. Here is an example:
@@ -461,6 +472,13 @@ second task has name :code:`eval`. The tasks are separated by a line with three
dashes :code:`---`. Each task has its own :code:`resources`, :code:`setup`, and
:code:`run` sections. Tasks are executed sequentially. If a task fails, later tasks are skipped.
+.. tip::
+
+ To explicitly indicate a pipeline (sequential execution), you can add
+ :code:`execution: serial` to the header. This is optional since pipelines
+ are the default when :code:`execution` is omitted. Use :code:`execution: parallel`
+   for :ref:`job groups <job-groups>` instead.
+
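+For example, the optional header document (a sketch; the pipeline name is
+illustrative) would look like:
+
+.. code-block:: yaml
+
+    ---
+    name: my-pipeline
+    execution: serial  # Optional: sequential execution is the default.
+    ---
+    # ...task definitions follow, each in its own YAML document...
+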
To pass data between the tasks, use a shared file mount. In this example, the :code:`train` task writes its output to the :code:`/checkpoint` file mount, which the :code:`eval` task is then able to read from.
To submit the pipeline, the same command :code:`sky jobs launch` is used. The pipeline will be automatically launched and monitored by SkyPilot. You can check the status of the pipeline with :code:`sky jobs queue` or :code:`sky dashboard`.
diff --git a/docs/source/examples/models/index.rst b/docs/source/examples/models/index.rst
index b318907cfe9..ab837535265 100644
--- a/docs/source/examples/models/index.rst
+++ b/docs/source/examples/models/index.rst
@@ -17,7 +17,7 @@ Models
CodeLlama
Pixtral
Mixtral
- Mistral 7B
+ Mistral 7B
Qwen 3
Kimi K2
Kimi K2 Thinking
diff --git a/docs/source/examples/performance/index.rst b/docs/source/examples/performance/index.rst
index 81cc46c9ee6..baa805389e4 100644
--- a/docs/source/examples/performance/index.rst
+++ b/docs/source/examples/performance/index.rst
@@ -8,3 +8,4 @@ AI Performance
GCP/GKE GPUDirect
Coreweave with InfiniBand
Nebius with InfiniBand
+ Together AI with InfiniBand
diff --git a/docs/source/examples/performance/together_infiniband.md b/docs/source/examples/performance/together_infiniband.md
new file mode 120000
index 00000000000..606573580ae
--- /dev/null
+++ b/docs/source/examples/performance/together_infiniband.md
@@ -0,0 +1 @@
+../../generated-examples/together_infiniband.md
\ No newline at end of file
diff --git a/docs/source/getting-started/installation.rst b/docs/source/getting-started/installation.rst
index 9a54228311b..1ca456d06b2 100644
--- a/docs/source/getting-started/installation.rst
+++ b/docs/source/getting-started/installation.rst
@@ -21,10 +21,9 @@ SkyPilot supports installation with ``uv`` or ``pip``.
.. code-block:: shell
# Create a virtual environment with pip pre-installed (required for SkyPilot)
- # SkyPilot requires 3.7 <= python <= 3.13.
+ # SkyPilot requires 3.9 <= python <= 3.13.
uv venv --seed --python 3.10
source .venv/bin/activate # Use WSL on Windows
-
uv pip install skypilot
# install dependencies for the clouds you want to use
@@ -34,14 +33,14 @@ SkyPilot supports installation with ``uv`` or ``pip``.
The ``--seed`` flag is **required** as it ensures ``pip`` is installed in the virtual environment.
SkyPilot needs ``pip`` to build wheels for remote cluster setup.
-
+
.. tab-item:: uv tool
:sync: uv-tool-tab
.. code-block:: shell
# Install as a globally available tool with pip included
- # SkyPilot requires 3.7 <= python <= 3.13.
+ # SkyPilot requires 3.9 <= python <= 3.13.
uv tool install --with pip skypilot
# install dependencies for the clouds you want to use
@@ -67,6 +66,7 @@ SkyPilot supports installation with ``uv`` or ``pip``.
# install dependencies for the clouds you want to use
pip install "skypilot[kubernetes,aws,gcp]"
+
.. dropdown:: Install SkyPilot from nightly build or source
SkyPilot provides nightly builds and source code for the latest features and for development.
@@ -131,7 +131,7 @@ SkyPilot supports installation with ``uv`` or ``pip``.
git clone https://github.com/skypilot-org/skypilot.git
cd skypilot
- pip install -e .
+ pip install -e .
Alternatively, we also provide a :ref:`Docker image ` as a quick way to try out SkyPilot.
@@ -431,7 +431,7 @@ Install the necessary dependencies for AWS.
:sync: pip-tab
.. code-block:: shell
-
+
# SkyPilot requires 3.7 <= python <= 3.13.
# From stable release
pip install "skypilot[aws]"
@@ -586,7 +586,7 @@ Install the necessary dependencies for Azure.
:sync: pip-tab
.. code-block:: shell
-
+
# SkyPilot requires 3.7 <= python <= 3.13.
# From stable release
pip install "skypilot[azure]"
@@ -641,7 +641,7 @@ CoreWeave
:sync: pip-tab
.. code-block:: shell
-
+
# SkyPilot requires 3.7 <= python <= 3.13.
# From stable release
pip install "skypilot[coreweave]"
@@ -737,7 +737,7 @@ Install the necessary dependencies for Nebius.
:sync: pip-tab
.. code-block:: shell
-
+
# Nebius requires 3.10 <= python <= 3.13.
# From stable release
pip install "skypilot[nebius]"
@@ -818,7 +818,7 @@ Install the necessary dependencies for RunPod
:sync: pip-tab
.. code-block:: shell
-
+
# SkyPilot requires 3.7 <= python <= 3.13.
# From stable release
pip install "skypilot[runpod]"
@@ -868,7 +868,7 @@ Install the necessary dependencies for OCI.
:sync: pip-tab
.. code-block:: shell
-
+
# SkyPilot requires 3.7 <= python <= 3.13.
# From stable release
pip install "skypilot[oci]"
@@ -939,7 +939,7 @@ Install the necessary dependencies for Lambda Cloud.
:sync: pip-tab
.. code-block:: shell
-
+
# SkyPilot requires 3.7 <= python <= 3.13.
# From stable release
pip install "skypilot[lambda]"
@@ -989,7 +989,7 @@ Together AI |community-badge|
:sync: pip-tab
.. code-block:: shell
-
+
# SkyPilot requires 3.7 <= python <= 3.13.
# From stable release
pip install "skypilot[kubernetes]"
@@ -1042,7 +1042,7 @@ Install the necessary dependencies for Paperspace.
:sync: pip-tab
.. code-block:: shell
-
+
# SkyPilot requires 3.7 <= python <= 3.13.
# From stable release
pip install "skypilot[paperspace]"
@@ -1092,7 +1092,7 @@ Install the necessary dependencies for Vast.
:sync: pip-tab
.. code-block:: shell
-
+
# SkyPilot requires 3.7 <= python <= 3.13.
# From stable release
pip install "skypilot[vast]"
@@ -1143,7 +1143,7 @@ Install the necessary dependencies for Fluidstack.
:sync: pip-tab
.. code-block:: shell
-
+
# SkyPilot requires 3.7 <= python <= 3.13.
# From stable release
pip install "skypilot[fluidstack]"
@@ -1193,7 +1193,7 @@ Cudo Compute |community-badge|
:sync: pip-tab
.. code-block:: shell
-
+
# SkyPilot requires 3.7 <= python <= 3.13.
# From stable release
pip install "skypilot[cudo]"
@@ -1255,7 +1255,7 @@ Install the necessary dependencies for Shadeform.
:sync: pip-tab
.. code-block:: shell
-
+
# SkyPilot requires 3.7 <= python <= 3.13.
# From stable release
pip install "skypilot[shadeform]"
@@ -1308,7 +1308,7 @@ Install the necessary dependencies for IBM.
:sync: pip-tab
.. code-block:: shell
-
+
# IBM requires 3.7 <= python <= 3.11.
# From stable release
pip install "skypilot[ibm]"
@@ -1388,7 +1388,7 @@ Install the necessary dependencies for SCP.
:sync: pip-tab
.. code-block:: shell
-
+
# SCP requires 3.7 <= python <= 3.11.
# From stable release
pip install "skypilot[scp]"
@@ -1446,7 +1446,7 @@ Install the necessary dependencies for VMware vSphere.
:sync: pip-tab
.. code-block:: shell
-
+
# SkyPilot requires 3.7 <= python <= 3.13.
# From stable release
pip install "skypilot[vsphere]"
@@ -1524,7 +1524,7 @@ Install the necessary dependencies for Cloudflare R2.
:sync: pip-tab
.. code-block:: shell
-
+
# SkyPilot requires 3.7 <= python <= 3.13.
# From stable release
pip install "skypilot[cloudflare]"
@@ -1566,7 +1566,7 @@ Next, get your `Account ID `__ makes it easy to find global compute resources and train state-of-the-art models through distributed training across clusters. To configure Prime Intellect access:
+`Prime Intellect `__ makes it easy to find global compute resources and train state-of-the-art models through distributed training across clusters. To configure Prime Intellect access:
Install the necessary dependencies for Prime Intellect.
@@ -1597,7 +1597,7 @@ Install the necessary dependencies for Prime Intellect.
:sync: pip-tab
.. code-block:: shell
-
+
# SkyPilot requires 3.7 <= python <= 3.13.
# From stable release
pip install "skypilot[primeintellect]"
@@ -1653,7 +1653,7 @@ Seeweb |community-badge|
:sync: pip-tab
.. code-block:: shell
-
+
# Seeweb requires 3.10 <= python <= 3.13.
# From stable release
pip install "skypilot[seeweb]"
@@ -1715,4 +1715,4 @@ Finally, you can stop the container with:
See more details about the dev container image
``berkeleyskypilot/skypilot-nightly`` `here
-`_.
\ No newline at end of file
+`_.
diff --git a/docs/source/images/dashboard-clusters.png b/docs/source/images/dashboard-clusters.png
index 7de83651f30..d3708661261 100644
Binary files a/docs/source/images/dashboard-clusters.png and b/docs/source/images/dashboard-clusters.png differ
diff --git a/docs/source/images/dashboard-managed-jobs.png b/docs/source/images/dashboard-managed-jobs.png
index 9513f786479..52aff598bcb 100644
Binary files a/docs/source/images/dashboard-managed-jobs.png and b/docs/source/images/dashboard-managed-jobs.png differ
diff --git a/docs/source/images/job-groups-dashboard.png b/docs/source/images/job-groups-dashboard.png
new file mode 100644
index 00000000000..759b4b7aa6a
Binary files /dev/null and b/docs/source/images/job-groups-dashboard.png differ
diff --git a/docs/source/images/job-groups-rl-architecture.jpg b/docs/source/images/job-groups-rl-architecture.jpg
new file mode 100644
index 00000000000..f608e172ddb
Binary files /dev/null and b/docs/source/images/job-groups-rl-architecture.jpg differ
diff --git a/docs/source/images/job-groups-train-eval-architecture.png b/docs/source/images/job-groups-train-eval-architecture.png
new file mode 100644
index 00000000000..206a916fd1a
Binary files /dev/null and b/docs/source/images/job-groups-train-eval-architecture.png differ
diff --git a/docs/source/images/metrics/deploy-prom-operator.png b/docs/source/images/metrics/deploy-prom-operator.png
deleted file mode 100644
index 2be5229b386..00000000000
Binary files a/docs/source/images/metrics/deploy-prom-operator.png and /dev/null differ
diff --git a/docs/source/images/metrics/search-prom-operator.png b/docs/source/images/metrics/search-prom-operator.png
deleted file mode 100644
index 3260ab3b13b..00000000000
Binary files a/docs/source/images/metrics/search-prom-operator.png and /dev/null differ
diff --git a/docs/source/images/metrics/status-prom-operator.png b/docs/source/images/metrics/status-prom-operator.png
deleted file mode 100644
index 688b6e33da6..00000000000
Binary files a/docs/source/images/metrics/status-prom-operator.png and /dev/null differ
diff --git a/docs/source/images/slurm-cluster-details-page.png b/docs/source/images/slurm-cluster-details-page.png
new file mode 100644
index 00000000000..fbed25568af
Binary files /dev/null and b/docs/source/images/slurm-cluster-details-page.png differ
diff --git a/docs/source/images/slurm-infra-page.png b/docs/source/images/slurm-infra-page.png
new file mode 100644
index 00000000000..17114ab9a89
Binary files /dev/null and b/docs/source/images/slurm-infra-page.png differ
diff --git a/docs/source/reference/api-server/examples/api-server-gpu-metrics-setup.rst b/docs/source/reference/api-server/examples/api-server-gpu-metrics-setup.rst
index e8fe887ac37..2050d25ee9d 100644
--- a/docs/source/reference/api-server/examples/api-server-gpu-metrics-setup.rst
+++ b/docs/source/reference/api-server/examples/api-server-gpu-metrics-setup.rst
@@ -146,21 +146,37 @@ Prometheus setup
In the cluster where you deploy the API server, Prometheus is installed automatically as part of :ref:`api-server-setup-dcgm-metrics-scraping`.
-For other Kubernetes clusters (external clusters), deploy Prometheus manually. SkyPilot also requires a Service ``skypilot-prometheus-server`` in the ``skypilot`` namespace to scrape metrics from external clusters.
+For other Kubernetes clusters (external clusters), deploy Prometheus manually. SkyPilot requires a Service named ``skypilot-prometheus-server`` in the ``skypilot`` namespace to scrape metrics from external clusters.
-If you use the `Prometheus operator `_, e.g., the `kube-prometheus-stack `_, install it in the ``skypilot`` namespace, then create the ``skypilot-prometheus-server`` Service in the same namespace.
+First, create a ``prometheus-values.yaml`` file with the following configuration:
+
+.. literalinclude:: ../../../../../examples/metrics/prometheus-values.yaml
+ :language: yaml
+
+Then install Prometheus using ``skypilot-prometheus`` as the release name (this creates the required ``skypilot-prometheus-server`` service):
.. code-block:: bash
- kubectl create -f https://raw.githubusercontent.com/skypilot-org/skypilot/refs/heads/master/examples/metrics/skypilot_prometheus_server_service.yaml -n skypilot
+ helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
+ helm repo update
+ helm upgrade --install skypilot-prometheus prometheus-community/prometheus \
+ --namespace skypilot \
+ --create-namespace \
+ -f prometheus-values.yaml
-Alternatively, install the SkyPilot Prometheus server chart; it will create the ``skypilot-prometheus-server`` Service automatically:
+Verify the service was created:
.. code-block:: bash
- helm upgrade --install skypilot skypilot/skypilot-prometheus-server --devel \
- --namespace skypilot \
- --create-namespace
+ kubectl get svc skypilot-prometheus-server -n skypilot
+
+Refer to the `Prometheus helm chart values `_ for additional configuration options.
+
+.. note::
+
+ Do not use the Prometheus Operator (kube-prometheus-stack) for GPU metrics.
+ The Prometheus Operator adds an ``exported_`` prefix to pod and namespace labels,
+ which breaks the PromQL queries used by SkyPilot.
If you are using the Nebius Kubernetes cluster, refer to :ref:`api-server-gpu-metrics-setup-nebius` for how to setup the GPU metrics.
diff --git a/docs/source/reference/api-server/examples/example-deploy-gke-nebius-okta.rst b/docs/source/reference/api-server/examples/example-deploy-gke-nebius-okta.rst
index bb3c023c709..1c75e202b6e 100644
--- a/docs/source/reference/api-server/examples/example-deploy-gke-nebius-okta.rst
+++ b/docs/source/reference/api-server/examples/example-deploy-gke-nebius-okta.rst
@@ -428,38 +428,29 @@ Setup GPU metrics in Nebius Kubernetes cluster
If you are using Nebius Kubernetes cluster, you can setup GPU metrics in the cluster to get real-time GPU metrics in the SkyPilot dashboard.
-1. Install the Prometheus operator.
+1. Install Prometheus.
-On Nebius console, in the detail page of the Nebius Kubernetes cluster, go to ``Applications`` -> Search for ``Prometheus Operator`` -> ``Deploy`` -> Enter ``skypilot`` for the ``Namespace`` -> ``Deploy application``.
+First, create a ``prometheus-values.yaml`` file with the following configuration:
-.. image:: ../../../images/metrics/search-prom-operator.png
- :alt: Search for Prometheus Operator
- :align: center
- :width: 60%
-
-.. image:: ../../../images/metrics/deploy-prom-operator.png
- :alt: Deploy Prometheus Operator
- :align: center
- :width: 60%
+.. literalinclude:: ../../../../../examples/metrics/prometheus-values.yaml
+ :language: yaml
-Wait for the Prometheus operator to be installed, the status badge will become ``Deployed``.
-
-.. image:: ../../../images/metrics/status-prom-operator.png
- :alt: Status of Prometheus Operator
- :align: center
- :width: 60%
-
-You can also check the Pod status to verify the installation.
+Then install Prometheus using ``skypilot-prometheus`` as the release name:
.. code-block:: bash
- kubectl get pods -n skypilot
+ helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
+ helm repo update
+ helm upgrade --install skypilot-prometheus prometheus-community/prometheus \
+ --namespace skypilot \
+ --create-namespace \
+ -f prometheus-values.yaml
-By default, the CPU and memory metrics exported by node exporter do not include the ``node`` label, which is required for the SkyPilot dashboard to display the metrics. You can add the ``node`` label to the metrics by applying the following config to the node exporter service monitor resource:
+Verify the ``skypilot-prometheus-server`` service was created:
.. code-block:: bash
- kubectl apply -f https://raw.githubusercontent.com/skypilot-org/skypilot/refs/heads/master/examples/metrics/kube_prometheus_node_exporter_service_monitor.yaml -n skypilot
+ kubectl get svc skypilot-prometheus-server -n skypilot
2. Install the Nvidia Device Plugin.
@@ -490,21 +481,7 @@ You can also check the Pod status to verify the installation.
The dcgm exporter will be installed automatically.
-3. Create the Prometheus service for SkyPilot API server to retrieve the GPU metrics:
-
- .. code-block:: bash
-
- kubectl create -f https://raw.githubusercontent.com/skypilot-org/skypilot/refs/heads/master/examples/metrics/skypilot_prometheus_server_service.yaml -n skypilot
-
-Confirm that the service endpoint is created by running the following command:
-
-.. code-block:: bash
-
- kubectl get endpoints skypilot-prometheus-server -n skypilot
- NAME ENDPOINTS AGE
- skypilot-prometheus-server 10.24.20.128:9090 62s
-
-4. If you are using multiple Kubernetes clusters, you will need to add the context names to ``allowed_contexts`` in the SkyPilot config.
+3. If you are using multiple Kubernetes clusters, you will need to add the context names to ``allowed_contexts`` in the SkyPilot config.
An example config file that allows using the hosting Kubernetes cluster and two additional Kubernetes clusters is shown below:
diff --git a/docs/source/reference/api-server/helm-values-spec.rst b/docs/source/reference/api-server/helm-values-spec.rst
index b87246f76d9..4c9daa9604e 100644
--- a/docs/source/reference/api-server/helm-values-spec.rst
+++ b/docs/source/reference/api-server/helm-values-spec.rst
@@ -27,10 +27,6 @@ Values
Below is the available helm value keys and the default value of each key:
-..
- Omitted values:
- * storage.accessMode: accessMode other than ReadWriteOnce is not tested yet.
-
.. parsed-literal::
:ref:`global `:
@@ -100,6 +96,11 @@ Below is the available helm value keys and the default value of each key:
:ref:`cookie-expire `: null
:ref:`serviceAccount `:
:ref:`enabled `: null
+ :ref:`externalProxy <helm-values-auth-externalProxy>`:
+   :ref:`enabled <helm-values-auth-externalProxy-enabled>`: false
+   :ref:`headerName <helm-values-auth-externalProxy-headerName>`: 'X-Auth-Request-Email'
+   :ref:`headerFormat <helm-values-auth-externalProxy-headerFormat>`: 'plaintext'
+   :ref:`jwtIdentityClaim <helm-values-auth-externalProxy-jwtIdentityClaim>`: 'sub'
:ref:`storage <helm-values-storage>`:
  :ref:`enabled <helm-values-storage-enabled>`: true
@@ -423,6 +424,11 @@ Upgrade strategy for the API server deployment. Available options are:
When set to ``RollingUpdate``, an external database must be configured via :ref:`apiService.dbConnectionSecretName ` or :ref:`apiService.dbConnectionString `.
+For persistent storage with RollingUpdate:
+
+- If :ref:`storage.enabled=true <helm-values-storage-enabled>`, set :ref:`storage.accessMode <helm-values-storage-accessMode>` to ``ReadWriteMany`` with an RWX-capable storage class (e.g., NFS-backed storage). This sets the ``SKYPILOT_API_SERVER_STORAGE_ENABLED`` environment variable, ensuring managed job logs and file mounts persist across rolling updates.
+- If ``storage.enabled=false``, file mounts and logs will be lost on pod restart. Consider configuring ``jobs.bucket`` in the SkyPilot config to persist file mounts to cloud storage.
+
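+A minimal values combination for RollingUpdate with persistent storage (a
+sketch; the secret and storage class names are placeholders):
+
+.. code-block:: yaml
+
+    apiService:
+      upgradeStrategy: RollingUpdate
+      dbConnectionSecretName: my-db-secret  # External database is required for RollingUpdate.
+    storage:
+      enabled: true
+      accessMode: ReadWriteMany
+      storageClassName: my-rwx-storage-class  # Must support ReadWriteMany.
+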
Default: ``"Recreate"``
.. code-block:: yaml
@@ -1130,6 +1136,95 @@ Default: ``null``
serviceAccount:
enabled: true
+.. _helm-values-auth-externalProxy:
+
+``auth.externalProxy``
+^^^^^^^^^^^^^^^^^^^^^^
+
+Configuration for trusting an external authentication proxy in front of the API server. Use this when your infrastructure has a reverse proxy or load balancer that handles authentication (e.g., AWS ALB with Cognito, Azure Front Door with Azure AD, or a custom ingress controller with authentication middleware).
+
+When enabled, the API server extracts user identity from the HTTP header set by the proxy. The proxy is trusted to have already authenticated the user.
+
+This is mutually exclusive with :ref:`auth.oauth <helm-values-auth-oauth>` and :ref:`ingress.oauth2-proxy <helm-values-ingress-oauth2-proxy>`.
+
+Default: see the YAML below.
+
+.. code-block:: yaml
+
+ auth:
+ externalProxy:
+ enabled: false
+ headerName: 'X-Auth-Request-Email'
+      headerFormat: 'plaintext'
+      jwtIdentityClaim: 'sub'
+
+.. _helm-values-auth-externalProxy-enabled:
+
+``auth.externalProxy.enabled``
+''''''''''''''''''''''''''''''
+
+Enable external proxy authentication. When enabled, the API server will extract user identity from the header specified by ``headerName``.
+
+Default: ``false``
+
+.. code-block:: yaml
+
+ auth:
+ externalProxy:
+ enabled: true
+
+.. _helm-values-auth-externalProxy-headerName:
+
+``auth.externalProxy.headerName``
+'''''''''''''''''''''''''''''''''
+
+The HTTP header name containing the user identity.
+
+Default: ``'X-Auth-Request-Email'``
+
+.. code-block:: yaml
+
+ auth:
+ externalProxy:
+ headerName: 'X-WEBAUTH-USER'
+
+.. _helm-values-auth-externalProxy-headerFormat:
+
+``auth.externalProxy.headerFormat``
+'''''''''''''''''''''''''''''''''''
+
+The format of the header value. Available options:
+
+- ``plaintext``: The header value is the user identity directly (e.g., ``user@example.com``)
+- ``jwt``: The header value is a JWT token from which the identity should be extracted using ``jwtIdentityClaim``
+
+Use ``jwt`` when integrating with load balancers that pass JWT tokens.
+
+Default: ``'plaintext'``
+
+.. code-block:: yaml
+
+ auth:
+ externalProxy:
+ headerFormat: 'jwt'
+
+.. _helm-values-auth-externalProxy-jwtIdentityClaim:
+
+``auth.externalProxy.jwtIdentityClaim``
+'''''''''''''''''''''''''''''''''''''''
+
+The JWT claim to extract the user identity from when ``headerFormat`` is ``jwt``.
+
+Only used when ``headerFormat`` is ``jwt``.
+
+Default: ``'sub'``
+
+.. code-block:: yaml
+
+ auth:
+ externalProxy:
+ headerFormat: 'jwt'
+ jwtIdentityClaim: 'email'
+
.. _helm-values-storage:
@@ -1143,6 +1238,19 @@ Default: ``null``
Enable persistent storage for the API server, setting this to ``false`` is prone to data loss and should only be used for testing.
+When enabled, SkyPilot creates a PersistentVolumeClaim (PVC) to persist:
+
+- **Managed job logs**: Accessible via ``sky jobs logs <job_id>`` and ``sky jobs logs --controller <job_id>``
+- **File mounts**: Local files uploaded during managed job submission
+
+.. note::
+
+ Setting ``storage.enabled=true`` sets the environment variable ``SKYPILOT_API_SERVER_STORAGE_ENABLED=true`` on the API server pod. This ensures that managed job logs and file mounts persist across API server restarts and rolling updates.
+
+ Transient logs (api_server logs, sky-* cluster logs) are NOT persisted to minimize storage usage.
+
+For RollingUpdate upgrade strategy, see :ref:`apiService.upgradeStrategy <helm-values-apiService-upgradeStrategy>` for storage access mode requirements.
+
Default: ``true``
.. code-block:: yaml
@@ -1169,15 +1277,32 @@ Default: ``""``
``storage.accessMode``
^^^^^^^^^^^^^^^^^^^^^^
-Access mode for the persistent storage volume. Can be set to ``ReadWriteOnce`` or ``ReadWriteMany`` depending on what is supported by the storage class.
+Access mode for the persistent storage volume. Available options:
+
+- ``ReadWriteOnce`` (RWO): The volume can be mounted as read-write by a single node. This is the default and works with most storage classes. Compatible with ``Recreate`` upgrade strategy. **Not compatible with RollingUpdate upgrade strategy** since the PVC cannot be mounted by both old and new pods simultaneously during rolling updates.
+
+- ``ReadWriteMany`` (RWX): The volume can be mounted as read-write by multiple nodes. Compatible with both ``Recreate`` and ``RollingUpdate`` upgrade strategies. Requires an RWX-capable storage class such as:
+
+ - GKE: Filestore-backed storage class
+ - EKS: EFS CSI driver
+ - AKS: Azure Files
+ - On-prem: NFS provisioner
+
+For more details on upgrade strategies, see :ref:`apiService.upgradeStrategy <helm-values-apiService-upgradeStrategy>`.
Default: ``ReadWriteOnce``
.. code-block:: yaml
+ # For Recreate upgrade strategy (default), ReadWriteOnce is sufficient
storage:
accessMode: ReadWriteOnce
+ # For RollingUpdate upgrade strategy with persistent storage, use ReadWriteMany
+ storage:
+ accessMode: ReadWriteMany
+    storageClassName: ""  # Set to an RWX-capable storage class; the cluster default may not support RWX.
+
.. _helm-values-storage-size:
``storage.size``
diff --git a/docs/source/reference/auto-stop.rst b/docs/source/reference/auto-stop.rst
index 0c27f05927b..dec39e47209 100644
--- a/docs/source/reference/auto-stop.rst
+++ b/docs/source/reference/auto-stop.rst
@@ -139,3 +139,146 @@ Alternatively, pass the ``--wait-for`` flag to either ``sky autostop`` or ``sky
# Hard time limit: Stop after 10 minutes, regardless of running jobs or SSH sessions.
sky autostop mycluster -i 10 --wait-for none
+
+.. _auto-stop-hooks:
+
+Autostop hooks
+~~~~~~~~~~~~~~
+
+To execute a script before autostopping, specify a hook in the autostop configuration.
+The hook script runs on the remote cluster before the cluster is stopped or torn down.
+This is useful for tasks like committing code, saving checkpoints, or performing cleanup operations.
+
+.. code-block:: yaml
+
+ resources:
+ autostop:
+ idle_minutes: 10
+ hook: |
+ cd my-code-base
+ git add .
+ git commit -m "Commit my code"
+ git push
+ hook_timeout: 300
+
+The hook script runs on the cluster and has access to the cluster's filesystem and environment variables.
+If the hook script fails (non-zero exit code), the autostop process will still continue,
+but a warning will be logged.
+
+**Hook Timeout**
+
+By default, autostop hooks have a **1-hour (3600 seconds) timeout**. If your hook
+takes longer than this, it will be killed and autostop will proceed. To
+customize the timeout in your YAML configuration:
+
+.. code-block:: yaml
+
+ resources:
+ autostop:
+ idle_minutes: 10
+ hook: |
+ # Long-running backup operation
+ tar -czf backup.tar.gz /large/dataset
+ aws s3 cp backup.tar.gz s3://my-bucket/
+ hook_timeout: 7200 # 2 hours in seconds
+
+**Important Notes:**
+
+- If the hook times out, autostop will proceed after logging a warning
+- The minimum timeout is 1 second
+- Hook execution keeps the cluster from terminating while it runs, occupying its resources; keep this in mind when setting ``idle_minutes``
+
+Common use cases for autostop hooks:
+
+.. dropdown:: Committing and pushing code changes
+
+ .. code-block:: yaml
+
+ resources:
+ autostop:
+ idle_minutes: 10
+ hook: |
+ cd my-code-base
+ git add .
+ git commit -m "Auto-commit before shutdown"
+ git push
+
+.. dropdown:: Saving model checkpoints to persistent storage
+
+ .. code-block:: yaml
+
+ resources:
+ autostop:
+ idle_minutes: 10
+ hook: |
+ # Save checkpoints to a mounted volume or cloud storage
+ cp -r /workspace/checkpoints/* /mnt/persistent-storage/checkpoints/
+ # Or upload to S3
+ aws s3 sync /workspace/checkpoints/ s3://my-bucket/checkpoints/
+
+.. dropdown:: Uploading logs or results to cloud storage
+
+ .. code-block:: yaml
+
+ resources:
+ autostop:
+ idle_minutes: 10
+ hook: |
+ # Upload logs to S3
+ aws s3 sync /workspace/logs/ s3://my-bucket/logs/$(date +%Y%m%d)/
+ # Or upload to GCS
+ gcloud storage cp -r /workspace/results/ gs://my-bucket/results/$(date +%Y%m%d)/
+
+.. dropdown:: Syncing W&B runs before shutdown
+
+ .. code-block:: yaml
+
+ resources:
+ autostop:
+ idle_minutes: 10
+ hook: |
+ # Sync W&B runs to the cloud before shutdown
+ # Sync all runs in the wandb directory
+ wandb sync ./wandb
+ # Or sync a specific run
+ # wandb sync ./wandb/run-20250813_124246-n67z9ude
+
+.. dropdown:: Sending notifications about the cluster shutdown
+
+ .. code-block:: yaml
+
+ resources:
+ autostop:
+ idle_minutes: 10
+ hook: |
+ # Send email notification
+ echo "Cluster shutting down after idle period" | \
+ mail -s "Cluster Autostop" user@example.com
+ # Or send Slack notification via webhook
+ curl -X POST -H 'Content-type: application/json' \
+ --data '{"text":"Cluster shutting down after idle period"}' \
+ https://hooks.slack.com/services/YOUR/WEBHOOK/URL
+
+.. dropdown:: Triggering downstream workflows
+
+ .. code-block:: yaml
+
+ resources:
+ autostop:
+ idle_minutes: 10
+ hook: |
+ # Trigger an evaluation pipeline in Airflow
+ curl -X POST https://airflow.example.com/api/v1/dags/model_eval/dag_runs \
+ -H "Content-Type: application/json" \
+ -d '{"conf": {"model_path": "s3://my-bucket/models/v1"}}'
+
+.. dropdown:: Pushing model to Hugging Face Hub
+
+ .. code-block:: yaml
+
+ resources:
+ autostop:
+ idle_minutes: 10
+ hook: |
+ # Upload the trained model to Hugging Face Hub
+ huggingface-cli upload my-org/my-model /workspace/model-output .
diff --git a/docs/source/reference/config.rst b/docs/source/reference/config.rst
index c182d8427e0..eafde9ced37 100644
--- a/docs/source/reference/config.rst
+++ b/docs/source/reference/config.rst
@@ -63,6 +63,7 @@ Below is the configuration syntax and some example values. See detailed explanat
:ref:`provision `:
:ref:`ssh_timeout `: 10
+    :ref:`install_conda <config-yaml-provision-install-conda>`: false
:ref:`kubernetes `:
:ref:`ports `: loadbalancer
@@ -122,6 +123,9 @@ Below is the configuration syntax and some example values. See detailed explanat
map-migrated: my-value
Owner: user-unique-name
:ref:`vpc_name `: skypilot-vpc
+    :ref:`vpc_names <config-yaml-aws-vpc-names>`:
+ - skypilot-vpc-1
+ - skypilot-vpc-2
:ref:`use_internal_ips `: true
:ref:`use_ssm `: true
:ref:`ssh_proxy_command `: ssh -W %h:%p user@host
@@ -615,6 +619,31 @@ determines how long to wait for the connection to be established.
Default: ``10``.
+.. _config-yaml-provision-install-conda:
+
+``provision.install_conda``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Whether to install conda on the remote cluster (optional).
+
+SkyPilot clusters come with conda preinstalled for convenience.
+When set to ``false``, SkyPilot will not install conda on the cluster.
+
+Default: ``true``.
+
+Example:
+
+.. code-block:: yaml
+
+ provision:
+ install_conda: false
+
+.. note::
+
+ Default SkyPilot images often come with conda preinstalled.
+ To fully avoid installing conda, use a custom Docker image that does not have conda preinstalled
+ along with ``install_conda: false``.
+
.. _config-yaml-aws:
``aws``
@@ -670,6 +699,24 @@ Regions without a VPC with this name will not be used to launch nodes.
Default: ``null`` (use the default VPC in each region).
+Deprecated: use ``aws.vpc_names`` instead.
+
+.. _config-yaml-aws-vpc-names:
+
+``aws.vpc_names``
+~~~~~~~~~~~~~~~~~
+
+VPCs to use in each region (optional).
+
+If this is set, SkyPilot will try each listed VPC during failover, in regions
+that contain at least one of the listed VPCs (the provisioner automatically
+discovers such regions). Regions without any matching VPC will not be used
+to launch nodes.
+
+The value can be either a ``string`` (a single VPC) or a ``list`` (multiple
+target VPCs), as shown in the example below.
+
+Default: ``null`` (use the default VPC in each region).
+
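+Example (the VPC names are placeholders):
+
+.. code-block:: yaml
+
+    aws:
+      # A single VPC as a string:
+      # vpc_names: skypilot-vpc
+      # Or a list of VPCs to try during failover:
+      vpc_names:
+        - skypilot-vpc-1
+        - skypilot-vpc-2
+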
.. _config-yaml-aws-use-internal-ips:
``aws.use_internal_ips``
@@ -1364,6 +1411,7 @@ Example:
myannotation: myvalue
provision_timeout: 10
autoscaler: gke
+ set_pod_resource_limits: true # or a multiplier like 1.5
pod_config:
metadata:
labels:
@@ -1471,6 +1519,47 @@ Example:
post_provision_runcmd:
- echo "hello world!"
+.. _config-yaml-kubernetes-set-pod-resource-limits:
+
+``kubernetes.set_pod_resource_limits``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Set pod CPU/memory limits relative to requests (optional).
+
+This is useful for Kubernetes clusters that require resource limits to be set
+(e.g., for LimitRange enforcement, resource quotas, or cluster policies).
+
+Can be one of:
+
+- ``false`` (default): Do not set CPU/memory limits (only requests are set).
+- ``true``: Set limits equal to requests (multiplier of 1).
+- A positive number: Set limits to requests multiplied by this value (e.g., ``1.5`` for 50% headroom).
+
+Default: ``false``.
+
+Example:
+
+.. code-block:: yaml
+
+ kubernetes:
+ # Set limits equal to requests
+ set_pod_resource_limits: true
+
+.. code-block:: yaml
+
+ kubernetes:
+ # Set limits to 1.5x requests (50% headroom)
+ set_pod_resource_limits: 1.5
+
+This can also be configured per-context using ``context_configs``:
+
+.. code-block:: yaml
+
+ kubernetes:
+ context_configs:
+ prod-cluster:
+ set_pod_resource_limits: 2.0
+
.. _config-yaml-kubernetes-context-configs:
``kubernetes.context_configs``
diff --git a/docs/source/reference/kubernetes/kubernetes-getting-started.rst b/docs/source/reference/kubernetes/kubernetes-getting-started.rst
index 6083119d6a3..f64b79c9acd 100644
--- a/docs/source/reference/kubernetes/kubernetes-getting-started.rst
+++ b/docs/source/reference/kubernetes/kubernetes-getting-started.rst
@@ -142,32 +142,26 @@ Once your cluster administrator has :ref:`setup a Kubernetes cluster `, you can view resources from all users with :code:`sky status -u`:
.. code-block:: console
- $ sky status --k8s
- Kubernetes cluster state (context: mycluster)
- SkyPilot clusters
- USER NAME LAUNCHED INFRA RESOURCES STATUS
- alice infer-svc-1 23 hrs ago Kubernetes 1x(gpus=L4:1, ...) UP
- alice sky-jobs-controller-80b50983 2 days ago Kubernetes 1x(cpus=4, mem=4, ...) UP
- alice sky-serve-controller-80b50983 23 hrs ago Kubernetes 1x(cpus=4, mem=4, ...) UP
- bob dev 1 day ago Kubernetes 1x(gpus=H100:1, ...) UP
- bob multinode-dev 1 day ago Kubernetes 2x(cpus=2, mem=2, ...) UP
- bob sky-jobs-controller-2ea485ea 2 days ago Kubernetes 1x(cpus=4, mem=4, ...) UP
-
- Managed jobs
- In progress tasks: 1 STARTING
- USER ID TASK NAME REQUESTED SUBMITTED TOT. DURATION JOB DURATION #RECOVERIES STATUS
- alice 1 - eval 1x[CPU:1+] 2 days ago 49s 8s 0 SUCCEEDED
- bob 4 - pretrain 1x[H100:4] 1 day ago 1h 1m 11s 1h 14s 0 SUCCEEDED
- bob 3 - bigjob 1x[CPU:16] 1 day ago 1d 21h 11m 4s - 0 STARTING
- bob 2 - failjob 1x[CPU:1+] 1 day ago 54s 9s 0 FAILED
- bob 1 - shortjob 1x[CPU:1+] 2 days ago 1h 1m 19s 1h 16s 0 SUCCEEDED
+ $ sky status -u
+ Clusters
+ NAME USER WORKSPACE INFRA RESOURCES STATUS AUTOSTOP LAUNCHED
+ mycluster alice@example.com prod Kubernetes (k8s-context1) 1x(cpus=2, mem=4, ...) UP - 10 mins ago
+ dev alice@example.com ml-team Kubernetes (k8s-context2) 1x(gpus=H100:1, cpus=4, mem=16, ...) UP 10m 1 hr ago
+ training bob@example.com ml-team Kubernetes (k8s-context1) 1x(gpus=L4:4, cpus=8, mem=32, ...) UP - 2 hrs ago
You can also inspect the real-time GPU usage on the cluster with :code:`sky show-gpus --infra k8s`.
@@ -298,6 +292,23 @@ To use images from private repositories (e.g., Private DockerHub, Amazon ECR, Go
--docker-server=nvcr.io
+
+
+.. _kubernetes-using-volumes:
+
+Mounting NFS and other volumes
+------------------------------
+
+SkyPilot supports mounting various types of volumes to your pods on Kubernetes:
+
+* :ref:`Persistent volumes `: Independently managed volumes with lifecycle separate from clusters, ideal for long-term data storage and sharing datasets across clusters. These are backed by Kubernetes PVCs on block storage (e.g., AWS EBS, GCP Persistent Disk) or distributed file systems (e.g., JuiceFS, Nebius shared file system, AWS EFS, GCP Filestore).
+
+* :ref:`Ephemeral volumes `: Automatically created and deleted with your cluster, suitable for temporary storage and caches that are cluster-specific. Also backed by Kubernetes PVCs.
+
+* :ref:`Other volume types `: Mount hostPath, NFS, and other Kubernetes volume types by overriding SkyPilot's ``pod_config``.
+
+For detailed information on configuring and using volumes, see :ref:`Volumes on Kubernetes `.
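+
+For example, once a persistent volume exists, mounting it in a task YAML takes a single line (a minimal sketch; the volume name ``my-volume`` is illustrative):
+
+.. code-block:: yaml
+
+   volumes:
+     # Mount the volume named `my-volume` at /mnt/data on all nodes
+     /mnt/data: my-volume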
+
Opening ports
-------------
@@ -382,21 +393,6 @@ For example, to set custom environment variables and use GPUDirect RDMA, you can
pod_config:
...
-.. _kubernetes-using-volumes:
-
-Mounting volumes
-------------------------------
-
-SkyPilot supports mounting various types of volumes to your pods on Kubernetes:
-
-* **Persistent volumes**: Independently managed volumes with lifecycle separate from clusters, ideal for long-term data storage and sharing datasets across clusters. These can be backed by block storage (e.g., AWS EBS, GCP Persistent Disk) or distributed file systems (e.g., JuiceFS, Nebius shared file system, AWS EFS, GCP Filestore).
-
-* **Ephemeral volumes**: Automatically created and deleted with your cluster, suitable for temporary storage and caches that are cluster-specific.
-
-* **Other volume types**: You can also mount hostPath, NFS, etc. as needed.
-
-For detailed information on configuring and using volumes, see :ref:`volumes-on-kubernetes`.
-
FAQs
----
diff --git a/docs/source/reference/kubernetes/kubernetes-setup.rst b/docs/source/reference/kubernetes/kubernetes-setup.rst
index 9ddd52bae28..d8b5cbad687 100644
--- a/docs/source/reference/kubernetes/kubernetes-setup.rst
+++ b/docs/source/reference/kubernetes/kubernetes-setup.rst
@@ -257,18 +257,18 @@ The following setup steps are optional and can be performed based on your specif
.. _kubernetes-setup-volumes:
-Set up volumes
-^^^^^^^^^^^^^^^
+Set up NFS and other volumes
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
SkyPilot supports mounting various types of volumes to your pods on Kubernetes:
-* **Persistent volumes**: Independently managed volumes with lifecycle separate from clusters, ideal for long-term data storage and sharing datasets across clusters. These can be backed by block storage (e.g., AWS EBS, GCP Persistent Disk) or distributed file systems (e.g., JuiceFS, Nebius shared file system, AWS EFS, GCP Filestore).
+* :ref:`Persistent volumes `: Independently managed volumes with lifecycle separate from clusters, ideal for long-term data storage and sharing datasets across clusters. These are backed by Kubernetes PVCs on block storage (e.g., AWS EBS, GCP Persistent Disk) or distributed file systems (e.g., JuiceFS, Nebius shared file system, AWS EFS, GCP Filestore).
-* **Ephemeral volumes**: Automatically created and deleted with your cluster, suitable for temporary storage and caches that are cluster-specific.
+* :ref:`Ephemeral volumes `: Automatically created and deleted with your cluster, suitable for temporary storage and caches that are cluster-specific. Also backed by Kubernetes PVCs.
-* **Other volume types**: You can also mount hostPath, NFS, etc. as needed.
+* :ref:`Other volume types `: Mount hostPath, NFS, and other Kubernetes volume types by overriding SkyPilot's ``pod_config``.
-For detailed information on configuring and using volumes, see :ref:`volumes-on-kubernetes`.
+For detailed information on configuring and using volumes, see :ref:`Volumes on Kubernetes `.
.. _kubernetes-setup-priority:
@@ -394,33 +394,27 @@ Below, we provide tips on how to monitor SkyPilot resources on your Kubernetes c
List SkyPilot resources across all users
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-We provide a convenience command, :code:`sky status --k8s`, to view the status of all SkyPilot resources in the cluster.
+When using the :ref:`SkyPilot API server `, you can use the :ref:`SkyPilot dashboard ` to view SkyPilot resources across all users:
-Unlike :code:`sky status` which lists only the SkyPilot resources launched by the current user,
-:code:`sky status --k8s` lists all SkyPilot resources in the cluster across all users.
-.. code-block:: console
+.. image:: ../../images/dashboard-clusters.png
+ :width: 100%
+ :align: center
+ :alt: SkyPilot Dashboard
- $ sky status --k8s
- Kubernetes cluster state (context: mycluster)
- SkyPilot clusters
- USER NAME LAUNCHED RESOURCES STATUS
- alice infer-svc-1 23 hrs ago 1x Kubernetes(cpus=1, mem=1, {'L4': 1}) UP
- alice sky-jobs-controller-80b50983 2 days ago 1x Kubernetes(cpus=4, mem=4) UP
- alice sky-serve-controller-80b50983 23 hrs ago 1x Kubernetes(cpus=4, mem=4) UP
- bob dev 1 day ago 1x Kubernetes(cpus=2, mem=8, {'H100': 1}) UP
- bob multinode-dev 1 day ago 2x Kubernetes(cpus=2, mem=2) UP
- bob sky-jobs-controller-2ea485ea 2 days ago 1x Kubernetes(cpus=4, mem=4) UP
-
- Managed jobs
- In progress tasks: 1 STARTING
- USER ID TASK NAME RESOURCES SUBMITTED TOT. DURATION JOB DURATION #RECOVERIES STATUS
- alice 1 - eval 1x[CPU:1+] 2 days ago 49s 8s 0 SUCCEEDED
- bob 4 - pretrain 1x[H100:4] 1 day ago 1h 1m 11s 1h 14s 0 SUCCEEDED
- bob 3 - bigjob 1x[CPU:16] 1 day ago 1d 21h 11m 4s - 0 STARTING
- bob 2 - failjob 1x[CPU:1+] 1 day ago 54s 9s 0 FAILED
- bob 1 - shortjob 1x[CPU:1+] 2 days ago 1h 1m 19s 1h 16s 0 SUCCEEDED
+|
+
+Or run :code:`sky status -u`:
+
+.. code-block:: console
+
+ $ sky status -u
+ Clusters
+ NAME USER WORKSPACE INFRA RESOURCES STATUS AUTOSTOP LAUNCHED
+ training-multinode alice@skypilot.co ml-team Kubernetes (nebius) 2x(gpus=H100:8, cpus=200, mem=800, ...) RUNNING 60m 5d ago
+ dev-alice alice@skypilot.co research-private Kubernetes (coreweave) 1x(gpus=H200:1, cpus=8, mem=32, ...) RUNNING - 6d ago
+ inference mike@skypilot.co default AWS (us-west-2) 1x(gpus=L4:1, g6.2xlarge, ...) RUNNING 30m 4d ago
+ dev-bob bob@skypilot.co default GCP (us-west1) 1x(cpus=4, mem=15, n1-standard-4, ...) STOPPED - 6d ago
.. _kubernetes-observability-dashboard:
@@ -434,6 +428,7 @@ SkyPilot resources on your cluster.
:align: center
:alt: Kubernetes dashboard
+|
As a demo, we provide a sample Kubernetes dashboard deployment manifest that you can deploy with:
diff --git a/docs/source/reference/kubernetes/kubernetes-troubleshooting.rst b/docs/source/reference/kubernetes/kubernetes-troubleshooting.rst
index d7e2aeb62c5..54b5761c194 100644
--- a/docs/source/reference/kubernetes/kubernetes-troubleshooting.rst
+++ b/docs/source/reference/kubernetes/kubernetes-troubleshooting.rst
@@ -5,7 +5,7 @@ Kubernetes Troubleshooting
If you're unable to run SkyPilot tasks on your Kubernetes cluster, this guide will help you debug common issues.
-If this guide does not help resolve your issue, please reach out to us on `Slack `_ or `GitHub `_.
+If this guide does not help resolve your issue, please reach out to us on `Slack `_ or `GitHub `_.
.. _kubernetes-troubleshooting-basic:
diff --git a/docs/source/reference/slurm/slurm-getting-started.rst b/docs/source/reference/slurm/slurm-getting-started.rst
index 2e72503301a..b616bfdbbac 100644
--- a/docs/source/reference/slurm/slurm-getting-started.rst
+++ b/docs/source/reference/slurm/slurm-getting-started.rst
@@ -74,7 +74,9 @@ Create the configuration file:
.. note::
- ``HostName``, ``User``, and ``IdentityFile`` are required fields.
+ ``HostName`` and ``User`` are required fields. ``IdentityFile`` is optional;
+ if not specified, SSH will use keys from ssh-agent or default key locations
+ (e.g., ``~/.ssh/id_rsa``, ``~/.ssh/id_ed25519``).
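+
+ For example, a minimal sketch of an entry that relies on ssh-agent or default keys (the host alias and address are illustrative):
+
+ .. code-block:: text
+
+    Host mycluster
+      HostName 10.0.0.1
+      User ubuntu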
Verify your SSH connection works by running:
@@ -178,6 +180,63 @@ SkyPilot will translate this to the appropriate ``--gres=gpu:`` directive for Sl
Common names include ``H100``, ``H200``, ``L4`` etc.
+Viewing GPU availability
+------------------------
+
+SkyPilot provides a unified dashboard to monitor GPU availability and utilization across **all** your Slurm clusters.
+
+To open the dashboard:
+
+.. code-block:: console
+
+ $ sky dashboard
+
+Navigate to the **Infra** tab to see the real-time GPU availability across all your Slurm clusters:
+
+.. image:: /images/slurm-infra-page.png
+ :alt: SkyPilot Dashboard showing Slurm GPU availability overview
+ :width: 100%
+
+|
+
+Click on a cluster name to see detailed GPU availability per node:
+
+.. image:: /images/slurm-cluster-details-page.png
+ :alt: SkyPilot Dashboard showing Slurm cluster GPU details
+ :width: 100%
+
+|
+
+You can also view GPU availability from the CLI:
+
+.. code-block:: console
+
+ $ sky show-gpus --infra slurm
+ Slurm GPUs
+ GPU UTILIZATION
+ L40S 3 of 8 free
+ GH200 1 of 2 free
+ H100 8 of 8 free
+
+ Slurm Cluster: mycluster1
+ GPU REQUESTABLE_QTY_PER_NODE UTILIZATION
+ L40S 1, 2, 4 3 of 8 free
+
+ Slurm Cluster: mycluster2
+ GPU REQUESTABLE_QTY_PER_NODE UTILIZATION
+ GH200 1 1 of 2 free
+
+ Slurm Cluster: mycluster3
+ GPU REQUESTABLE_QTY_PER_NODE UTILIZATION
+ H100 1, 2, 4, 8 8 of 8 free
+
+ Slurm per node GPU availability
+ CLUSTER NODE PARTITION STATE GPU UTILIZATION
+ mycluster1 ip-10-3-132-97 dev*,gpus mix L40S 1 of 4 free
+ mycluster1 ip-10-3-168-59 dev*,gpus mix L40S 2 of 4 free
+ ...
+
+
Shared filesystem (NFS)
-----------------------
diff --git a/docs/source/reference/volumes.rst b/docs/source/reference/volumes.rst
index e9c50b6b811..6ddb9257ca7 100644
--- a/docs/source/reference/volumes.rst
+++ b/docs/source/reference/volumes.rst
@@ -11,70 +11,13 @@ Benefits of using volumes:
* **Data persistence**: Volumes can persist data independently of task life cycles, making it easy to share data between different tasks (e.g., datasets, caches) or preserve results.
* **Size control**: You can set volume size limits to manage costs and limit storage usage.
-SkyPilot supports creating and managing volumes directly through the ``sky`` CLI and the web dashboard.
+Volumes are currently supported on Kubernetes clusters and RunPod.
-Supported volume types:
-- Kubernetes: `Persistent Volume Claims (PVCs) `_
-
- - Tested storage backends: AWS EBS, GCP Persistent Disk, Nebius network SSD, JuiceFS, Nebius shared file system, GCP Filestore
-
-- RunPod: `Network Volumes `_
-
-With SSH node pools, you can mount host volumes or directories into SkyPilot clusters and managed jobs. See :ref:`SSH node pools ` for details.
-
-.. _volumes-on-kubernetes:
-
-Volumes on Kubernetes
----------------------
-
-In Kubernetes clusters, PVCs (Persistent Volume Claims) request and bind to PV (Persistent Volume) resources. These persistent volumes can be backed by various storage backends, including **block storage** solutions (AWS EBS, GCP Persistent Disk) and **distributed file systems** (JuiceFS, Nebius shared file system, AWS EFS, GCP Filestore), etc.
-
-SkyPilot supports two types of volumes on Kubernetes:
-
-1. **Persistent volumes**: Managed independently through CLI commands with lifecycle separate from clusters
-2. **Ephemeral volumes**: Bound to cluster lifecycle, automatically created and deleted with the cluster
-
-.. list-table::
- :widths: 30 35 35
- :header-rows: 1
-
- * - Feature
- - Persistent Volumes
- - Ephemeral Volumes
- * - Lifecycle
- - Independent (manually managed)
- - Bound to cluster
- * - Creation
- - ``sky volumes apply``
- - Automatic (in task YAML)
- * - Deletion
- - ``sky volumes delete``
- - Automatic (with cluster)
- * - Sharing across clusters
- - Yes
- - No (cluster-specific)
- * - Use case
- - Long-term data, shared datasets
- - Temporary storage, caches
-
-In addition to the above, you can also mount PVCs, NFS or hostPath with Kubernetes configs. See :ref:`advanced-mount-pvc-with-kubernetes-configs` and :ref:`advanced-mount-nfs-hostpath-with-kubernetes-configs` for details.
-
-Persistent volumes
-~~~~~~~~~~~~~~~~~~
-
-Persistent volumes are created and managed independently using the following commands:
-
-- ``sky volumes apply``: Create a new volume
-- ``sky volumes ls``: List all volumes
-- ``sky volumes delete``: Delete a volume
-
-.. note::
-
- Volumes are shared across users on a SkyPilot API server. A user can mount volumes created by other users. This is useful for sharing caches and data across users.
+.. _volumes-quickstart:
Quickstart
-^^^^^^^^^^
+----------
1. Prepare a volume YAML file:
@@ -83,17 +26,12 @@ Quickstart
# volume.yaml
name: new-pvc
type: k8s-pvc
- infra: kubernetes # or k8s or k8s/context
+ infra: k8s # or `k8s/context` or `runpod`
size: 10Gi
- # If the PVC already exists, set `use_existing` to true and
- # set the `name` to the existing PVC name
+
+ # Optional: To use an existing PVC on k8s instead of creating a new one, set to `true` and set `name` to the existing PVC name.
# use_existing: true
- labels:
- key: value
- config:
- namespace: default # optional
- storage_class_name: csi-mounted-fs-path-sc # optional
- access_mode: ReadWriteMany # optional
+
2. Create the volume with ``sky volumes apply volume.yaml``:
@@ -114,34 +52,23 @@ Quickstart
run: |
echo "Hello, World!" > /mnt/data/hello.txt
-.. note::
+.. tip::
- - For multi-node clusters, volumes are mounted to all nodes. You must configure ``config.access_mode`` to ``ReadWriteMany`` and use a ``storage_class_name`` that supports the ``ReadWriteMany`` access mode. Otherwise, SkyPilot will fail to launch the cluster.
- - If you want to mount a volume to all the cluster or jobs by default, you can use the admin policy to inject the volume path into the task YAML. See :ref:`add-volumes-policy` for details.
+ For temporary or cache data that should only last for the lifetime of a SkyPilot cluster, use :ref:`ephemeral volumes `.
.. _volumes-on-kubernetes-manage:
Managing volumes
-^^^^^^^^^^^^^^^^
+----------------
List all volumes with ``sky volumes ls``:
.. code-block:: console
- $ sky volumes ls
- NAME TYPE INFRA SIZE USER WORKSPACE AGE STATUS LAST_USE USED_BY
- new-pvc k8s-pvc Kubernetes/nebius-mk8s-vol 1Gi alice default 8m IN_USE
-
+ $ sky volumes ls -v
+ NAME TYPE INFRA SIZE USER WORKSPACE AGE STATUS LAST_USE USED_BY NAME_ON_CLOUD STORAGE_CLASS ACCESS_MODE
+ new-pvc k8s-pvc Kubernetes/nebius-mk8s-vol 1Gi alice default 8m IN_USE 2025-06-24 10:18:32 training new-pvc-73ec42f2-5c6c4e csi-mounted-fs-path-sc ReadWriteMany
-.. tip::
-
- Use ``-v`` to view detailed information about a volume.
-
- .. code-block:: console
-
- $ sky volumes ls -v
- NAME TYPE INFRA SIZE USER WORKSPACE AGE STATUS LAST_USE USED_BY NAME_ON_CLOUD STORAGE_CLASS ACCESS_MODE
- new-pvc k8s-pvc Kubernetes/nebius-mk8s-vol 1Gi alice default 8m IN_USE 2025-06-24 10:18:32 training new-pvc-73ec42f2-5c6c4e csi-mounted-fs-path-sc ReadWriteMany
Delete a volume with ``sky volumes delete``:
@@ -161,183 +88,100 @@ You can also check the volumes in the SkyPilot dashboard.
:align: center
:width: 80%
-Filesystem volume examples
-^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-This section demonstrates how to configure and use distributed filesystems as SkyPilot volumes. We'll cover options like `JuiceFS `_ (a cloud-native distributed filesystem) and `Nebius shared file system `_ (a high-performance shared storage solution).
-
-
-.. tab-set::
-
- .. tab-item:: JuiceFS
- :sync: juicefs-tab
-
- To use JuiceFS as a SkyPilot volume:
-
- 1. **Install the JuiceFS CSI driver** on your Kubernetes cluster. Follow the official `installation guide `_ for detailed instructions.
-
- 2. **Verify the driver installation** - Confirm that the JuiceFS CSI Driver pods are running:
-
- .. code-block:: console
-
- $ kubectl -n kube-system get pod -l app.kubernetes.io/name=juicefs-csi-driver
- NAME READY STATUS RESTARTS AGE
- juicefs-csi-controller-0 2/2 Running 0 10m
- juicefs-csi-node-8rd96 3/3 Running 0 10m
-
- 3. **Set up JuiceFS storage and create a SkyPilot volume** - You can use either dynamic provisioning (with a StorageClass) or static provisioning (with a pre-created PV):
- .. tab-set::
-
- .. tab-item:: Dynamic Provisioning (StorageClass)
- :sync: dynamic-tab
-
- Create a StorageClass for dynamic provisioning. Refer to the `JuiceFS StorageClass guide `_ for details.
-
- .. code-block:: console
-
- $ kubectl get storageclass juicefs-sc
- NAME PROVISIONER RECLAIMPOLICY VOLUMEBINDINGMODE ALLOWVOLUMEEXPANSION AGE
- juicefs-sc csi.juicefs.com Retain Immediate false 10m
-
- Create a SkyPilot volume YAML referencing the StorageClass:
-
- .. code-block:: yaml
-
- # juicefs-volume.yaml
- name: juicefs-volume
- type: k8s-pvc
- infra: k8s
- size: 100Gi
- config:
- storage_class_name: juicefs-sc
- access_mode: ReadWriteMany
-
- .. code-block:: console
-
- $ sky volumes apply juicefs-volume.yaml
-
- .. tab-item:: Static Provisioning (PV)
- :sync: static-tab
-
- Create a PersistentVolume and PVC manually. Refer to the `JuiceFS static provisioning guide `_ for details.
-
- .. code-block:: console
-
- $ kubectl get pv juicefs-pv
- NAME CAPACITY ACCESS MODES RECLAIM POLICY STATUS CLAIM STORAGECLASS AGE
- juicefs-pv 100Gi RWX Retain Bound default/juicefs-pvc 10m
-
- $ kubectl get pvc juicefs-pvc
- NAME STATUS VOLUME CAPACITY ACCESS MODES STORAGECLASS AGE
- juicefs-pvc Bound juicefs-pv 100Gi RWX 10m
-
- Create a SkyPilot volume YAML with ``use_existing: true`` to reference the existing PVC:
-
- .. code-block:: yaml
-
- # juicefs-volume.yaml
- name: juicefs-volume
- type: k8s-pvc
- infra: k8s
- use_existing: true
- config:
- access_mode: ReadWriteMany
-
- .. code-block:: console
-
- $ sky volumes apply juicefs-volume.yaml
-
- 4. **Mount the volume to SkyPilot task** in your SkyPilot YAML:
-
- .. code-block:: yaml
-
- # task.yaml
- num_nodes: 2
-
- volumes:
- # Mount the JuiceFS volume to /mnt/data across all nodes
- /mnt/data: juicefs-volume
+.. _volumes-on-kubernetes:
- run: |
- # Verify the volume is mounted and accessible
- df -h /mnt/data
- ls -la /mnt/data
+Volumes on Kubernetes
+---------------------
- .. code-block:: console
+In Kubernetes clusters, SkyPilot volumes map to `PVCs (Persistent Volume Claims) `_.
- # Launch the cluster with the JuiceFS volume
- $ sky launch -c juicefs-cluster task.yaml
+PVCs can be backed by various storage backends, including **block storage** solutions (AWS EBS, GCP Persistent Disk) and **distributed file systems** (JuiceFS, Nebius shared file system, AWS EFS, GCP Filestore).
- .. tab-item:: Nebius shared file system
- :sync: nebius-tab
+SkyPilot volumes come in two types:
- To use Nebius shared file system as a SkyPilot volume:
+1. :ref:`Persistent volumes `: Managed through ``sky volumes`` CLI commands with lifecycle separate from SkyPilot clusters.
+2. :ref:`Ephemeral volumes `: Bound to the SkyPilot cluster lifecycle; automatically created by ``sky launch`` and deleted by ``sky down``.
- 1. **Set up the Nebius filesystem infrastructure** by following the official documentation:
+.. list-table::
+ :widths: 35 35 30
+ :header-rows: 1
- - `Create a shared filesystem `_
- - `Create a node group and mount the filesystem `_
- - `Install the CSI driver `_
+ * - Feature
+ - :ref:`Persistent volumes `
+ - :ref:`Ephemeral volumes `
+ * - Lifecycle
+ - Independent (managed via ``sky volumes``)
+ - Bound to SkyPilot cluster
+ * - Creation
+ - ``sky volumes apply``
+ - Automatic (in task YAML)
+ * - Deletion
+ - ``sky volumes delete``
+ - Automatic (with cluster)
+ * - Sharing across SkyPilot clusters
+ - Yes
+ - No (cluster-specific)
+ * - Use case
+ - Long-term data, code, shared datasets
+ - Temporary storage, caches
- 2. **Verify the storage class** - Confirm that the ``csi-mounted-fs-path-sc`` storage class has been created:
+.. tip::
- .. code-block:: console
+ For advanced use cases, you can also mount PVCs, NFS, or hostPath volumes by overriding SkyPilot's pod configs.
+ See :ref:`advanced-mount-pvc-with-kubernetes-configs` for details.
- $ kubectl get storageclass
- NAME PROVISIONER RECLAIMPOLICY VOLUMEBINDINGMODE ALLOWVOLUMEEXPANSION AGE
- csi-mounted-fs-path-sc mounted-fs-path.csi.nebius.ai Delete WaitForFirstConsumer false 10m
+.. _persistent-volumes:
- 3. **Create a SkyPilot volume for Nebius file system** with a volume YAML:
+Persistent volumes
+~~~~~~~~~~~~~~~~~~
- .. code-block:: yaml
+Persistent volumes are created and managed independently using the ``sky volumes`` CLI commands described in the :ref:`Quickstart ` and :ref:`Managing volumes ` sections above.
- # nebius-volume.yaml
- name: nebius-pvc
- type: k8s-pvc
- infra: k8s
- size: 100Gi
- config:
- storage_class_name: csi-mounted-fs-path-sc
- access_mode: ReadWriteMany
+.. note::
- .. code-block:: console
+ Persistent volumes are shared across users on a SkyPilot API server. A user can mount volumes created by other users. This is useful for sharing caches and data across users.
- $ sky volumes apply nebius-volume.yaml
+**Volume YAML configuration options:**
- 4. **Mount the volume to SkyPilot task** in your SkyPilot YAML:
+.. code-block:: yaml
- .. code-block:: yaml
+ # volume.yaml
+ name: my-volume
+ type: k8s-pvc
+ infra: k8s # or k8s/
+ size: 10Gi
- # task.yaml
- num_nodes: 2
+ # Optional: To use an existing PVC instead of creating a new one, set to `true` and set `name` to the existing PVC name.
+ use_existing: true
- volumes:
- # Mount the Nebius shared filesystem to /mnt/data across all nodes
- /mnt/data: nebius-pvc
+ # Optional: add labels to the PVC
+ labels:
+ key: value
- run: |
- # Verify the volume is mounted and accessible
- df -h /mnt/data
- ls -la /mnt/data
+ # Optional: additional configuration
+ config:
+ namespace: default
+ storage_class_name: csi-mounted-fs-path-sc
+ access_mode: ReadWriteMany # Required for multi-node clusters
- .. code-block:: console
+.. note::
- # Launch the cluster with the Nebius volume
- $ sky launch -c nebius-cluster task.yaml
+ - For multi-node clusters, volumes are mounted to all nodes. You must set ``config.access_mode`` to ``ReadWriteMany`` and use a ``storage_class_name`` that supports this access mode. Otherwise, SkyPilot will fail to launch the cluster.
+ - To mount a volume to all clusters or jobs by default, use the admin policy to inject the volume path into the task YAML. See :ref:`add-volumes-policy` for details.
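+
+For example, here is a minimal sketch of a multi-node task mounting a persistent volume (the volume name ``my-volume`` is illustrative; it must already have been created with ``sky volumes apply``):
+
+.. code-block:: yaml
+
+   # task.yaml
+   num_nodes: 2
+
+   volumes:
+     # Mounted at /mnt/data on every node; requires ReadWriteMany
+     /mnt/data: my-volume
+
+   run: |
+     df -h /mnt/data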
+.. _ephemeral-volumes:
Ephemeral volumes
~~~~~~~~~~~~~~~~~
-Unlike persistent volumes that are managed independently, ephemeral volumes are automatically created when a cluster is launched and deleted when the cluster is terminated. This makes them ideal for temporary storage needs such as caches, intermediate results, or any data that should only exist for the duration of a cluster's lifetime.
-
-**Key characteristics:**
+Unlike persistent volumes, which are managed independently via the ``sky volumes`` CLI commands, ephemeral volumes are created automatically when a cluster is launched via ``sky launch`` and deleted when the cluster is terminated via ``sky down`` or autodown.
- **Automatic lifecycle management**: No need to manually create or delete volumes
- **Cluster-bound**: Created with the cluster and deleted when the cluster is terminated
- **Simplified usage**: Defined directly in the task YAML with the cluster configuration
-- **Currently Kubernetes-only**: Only supported on Kubernetes clusters
+- **Ideal for temporary storage**: caches, intermediate results, or any data that should only exist for the duration of a cluster's lifetime
+
To use an ephemeral volume, simply specify the ``size`` field in the volumes section of your task YAML:
@@ -385,93 +229,18 @@ When you terminate the cluster, the ephemeral volumes are automatically deleted:
# Cluster and its ephemeral volumes are deleted
.. _advanced-mount-pvc-with-kubernetes-configs:
-
-Advanced: Mount PVCs with Kubernetes configs
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Using SkyPilot volumes allows you to mount different volumes to different tasks. SkyPilot also offers an advanced way to mount a Kubernetes PVC with the detailed Kubernetes configs. This allows you to:
-
-1. Mount a PVC with additional configurations that is not supported by SkyPilot volumes.
-
-2. Specify a global (per Kubernetes context) PVC to be mounted on all SkyPilot clusters.
-
-Mount a PVC with additional configuration
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-To mount a PVC with additional configuration, you can set the ``kubernetes.pod_config`` in the :ref:`advanced config `:
-
-.. code-block:: yaml
-
- kubernetes:
- pod_config:
- spec:
- securityContext:
- fsGroup: 1000
- fsGroupChangePolicy: OnRootMismatch
- containers:
- - volumeMounts:
- - mountPath: /mnt/data
- name: my-pvc
- volumes:
- - name: my-pvc
- persistentVolumeClaim:
- claimName: my-pvc
-
-.. note::
-
- The ``kubernetes.pod_config`` in the advanced config applies to every cluster launched on Kubernetes. To mount different PVCs per cluster, set the ``kubernetes.pod_config`` in the task YAML file as described in the :ref:`per-task configuration `. Refer to Kubernetes `volume mounts `_ and `volumes `_ documentation for more details.
-
-Mount a PVC to all clusters in each context
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-If you want to mount different PVCs for different Kubernetes contexts, you can set the ``allowed_contexts`` and ``context_configs`` in the :ref:`advanced config `.
-
-.. code-block:: yaml
-
- kubernetes:
- allowed_contexts:
- - context1
- - context2
- context_configs:
- context1:
- pod_config:
- spec:
- securityContext:
- fsGroup: 1000
- fsGroupChangePolicy: OnRootMismatch
- containers:
- - volumeMounts:
- - mountPath: /mnt/data
- name: my-pvc
- volumes:
- - name: my-pvc
- persistentVolumeClaim:
- claimName: pvc1
- context2:
- pod_config:
- spec:
- securityContext:
- fsGroup: 1000
- fsGroupChangePolicy: OnRootMismatch
- containers:
- - volumeMounts:
- - mountPath: /mnt/data
- name: my-pvc
- volumes:
- - name: my-pvc
- persistentVolumeClaim:
- claimName: pvc2
-
.. _advanced-mount-nfs-hostpath-with-kubernetes-configs:
-Advanced: Mount NFS or hostPath with Kubernetes configs
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Advanced: Use Kubernetes configs to mount PVCs, NFS, or hostPath
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-`Kubernetes volumes `_ can be attached to your SkyPilot pods using the :ref:`pod_config ` field. This is useful for accessing shared storage such as NFS or local high-performance storage like NVMe drives.
+In addition to using SkyPilot volumes, you can also mount `Kubernetes volumes `_ (PVCs, NFS, hostPath) by overriding SkyPilot's :ref:`pod_config `. This is useful for:
-Volume mounting can be done directly in the task YAML on a per-task basis, or globally for all tasks in `SkyPilot config `_.
+1. Mounting a PVC with additional configurations not supported by SkyPilot volumes (e.g., ``fsGroup``, ``fsGroupChangePolicy``).
+2. Specifying a global (per Kubernetes context) volume to be mounted on all SkyPilot clusters.
+3. Accessing shared storage such as NFS or local high-performance storage like NVMe drives.
-Examples:
+Volume mounting can be done directly in the task YAML on a per-task basis, or globally for all tasks in `SkyPilot config `_.
.. tab-set::
@@ -612,8 +381,102 @@ Examples:
path: /path/on/host/nvme
type: Directory
+ .. tab-item:: PVC
+ :name: kubernetes-volumes-pvc
+
+ Mount a PVC with additional configurations like ``fsGroup`` and ``fsGroupChangePolicy``.
+
+ **Per-task configuration:**
+
+ .. code-block:: yaml
+
+ # task.yaml
+ run: |
+ echo "Hello, world!" > /mnt/data/hello.txt
+ ls -la /mnt/data
+ config:
+ kubernetes:
+ pod_config:
+ spec:
+ securityContext:
+ fsGroup: 1000
+ fsGroupChangePolicy: OnRootMismatch
+ containers:
+ - volumeMounts:
+ - mountPath: /mnt/data
+ name: my-pvc
+ volumes:
+ - name: my-pvc
+ persistentVolumeClaim:
+ claimName: my-pvc
+
+ **Global configuration:**
+
+ .. code-block:: yaml
+
+ # SkyPilot config
+ kubernetes:
+ pod_config:
+ spec:
+ securityContext:
+ fsGroup: 1000
+ fsGroupChangePolicy: OnRootMismatch
+ containers:
+ - volumeMounts:
+ - mountPath: /mnt/data
+ name: my-pvc
+ volumes:
+ - name: my-pvc
+ persistentVolumeClaim:
+ claimName: my-pvc
+
+ **Mount different PVCs per context:**
+
+ If you want to mount different PVCs for different Kubernetes contexts, you can set the ``allowed_contexts`` and ``context_configs`` in the :ref:`advanced config `.
+
+ .. code-block:: yaml
+
+ # SkyPilot config
+ kubernetes:
+ allowed_contexts:
+ - context1
+ - context2
+ context_configs:
+ context1:
+ pod_config:
+ spec:
+ securityContext:
+ fsGroup: 1000
+ fsGroupChangePolicy: OnRootMismatch
+ containers:
+ - volumeMounts:
+ - mountPath: /mnt/data
+ name: my-pvc
+ volumes:
+ - name: my-pvc
+ persistentVolumeClaim:
+ claimName: pvc1
+ context2:
+ pod_config:
+ spec:
+ securityContext:
+ fsGroup: 1000
+ fsGroupChangePolicy: OnRootMismatch
+ containers:
+ - volumeMounts:
+ - mountPath: /mnt/data
+ name: my-pvc
+ volumes:
+ - name: my-pvc
+ persistentVolumeClaim:
+ claimName: pvc2
+
+ .. note::
+
+ The ``kubernetes.pod_config`` in the advanced config applies to every cluster launched on Kubernetes. To mount different PVCs per cluster, set the ``kubernetes.pod_config`` in the task YAML file as described in the :ref:`per-task configuration `. Refer to Kubernetes `volume mounts `_ and `volumes `_ documentation for more details.
+
.. tab-item:: Nebius shared filesystem
- :name: kubernetes-volumes-nebius-shared-filesystem
+ :name: primitives-volumes-nebius-vm-hostpath
When creating a node group on the Nebius console, attach your desired shared file system to the node group (``Create Node Group`` -> ``Attach shared filesystem``):
@@ -666,12 +529,183 @@ Examples:
path: /mnt/ # e.g. /mnt/filesystem-d0
type: Directory
+
.. note::
When using `hostPath volumes `_, the specified paths must already exist on the Kubernetes node where the pod is scheduled.
For NFS mounts using hostPath, ensure the NFS mount is already configured on all Kubernetes nodes.
+Advanced: Installing additional storage backends
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+SkyPilot volumes work with any Kubernetes StorageClass already available in your cluster. If your cluster doesn't have a StorageClass that meets your needs, you can optionally install one.
+
+Below are example configurations for setting up shared filesystems like JuiceFS or Nebius Shared Filesystem as SkyPilot volumes. Any storage backend that provides a Kubernetes StorageClass will work.
+
+.. dropdown:: Installing additional storage backends - JuiceFS, Nebius Shared Filesystem
+ :animate: fade-in
+
+ .. tab-set::
+
+ .. tab-item:: JuiceFS
+ :sync: juicefs-tab
+
+ To use `JuiceFS `_ as a SkyPilot volume:
+
+ 1. **Install the JuiceFS CSI driver** on your Kubernetes cluster. Follow the official `installation guide `_ for detailed instructions.
+
+ 2. **Verify the driver installation** - Confirm that the JuiceFS CSI Driver pods are running:
+
+ .. code-block:: console
+
+ $ kubectl -n kube-system get pod -l app.kubernetes.io/name=juicefs-csi-driver
+ NAME READY STATUS RESTARTS AGE
+ juicefs-csi-controller-0 2/2 Running 0 10m
+ juicefs-csi-node-8rd96 3/3 Running 0 10m
+
+ 3. **Set up JuiceFS storage and create a SkyPilot volume** - You can use either dynamic provisioning (with a StorageClass) or static provisioning (with a pre-created PV):
+
+ .. tab-set::
+
+ .. tab-item:: Dynamic Provisioning (StorageClass)
+ :sync: dynamic-tab
+
+ Create a StorageClass for dynamic provisioning. Refer to the `JuiceFS StorageClass guide `_ for details.
+
+ .. code-block:: console
+
+ $ kubectl get storageclass juicefs-sc
+ NAME PROVISIONER RECLAIMPOLICY VOLUMEBINDINGMODE ALLOWVOLUMEEXPANSION AGE
+ juicefs-sc csi.juicefs.com Retain Immediate false 10m
+
+ Create a SkyPilot volume YAML referencing the StorageClass:
+
+ .. code-block:: yaml
+
+ # juicefs-volume.yaml
+ name: juicefs-volume
+ type: k8s-pvc
+ infra: k8s
+ size: 100Gi
+ config:
+ storage_class_name: juicefs-sc
+ access_mode: ReadWriteMany
+
+ .. code-block:: console
+
+ $ sky volumes apply juicefs-volume.yaml
+
+ .. tab-item:: Static Provisioning (PV)
+ :sync: static-tab
+
+ Create a PersistentVolume and PVC manually. Refer to the `JuiceFS static provisioning guide `_ for details.
+
+ .. code-block:: console
+
+ $ kubectl get pv juicefs-pv
+ NAME CAPACITY ACCESS MODES RECLAIM POLICY STATUS CLAIM STORAGECLASS AGE
+ juicefs-pv 100Gi RWX Retain Bound default/juicefs-pvc 10m
+
+ $ kubectl get pvc juicefs-pvc
+ NAME STATUS VOLUME CAPACITY ACCESS MODES STORAGECLASS AGE
+ juicefs-pvc Bound juicefs-pv 100Gi RWX 10m
+
+ Create a SkyPilot volume YAML with ``use_existing: true`` to reference the existing PVC:
+
+ .. code-block:: yaml
+
+ # juicefs-volume.yaml
+ name: juicefs-volume
+ type: k8s-pvc
+ infra: k8s
+ use_existing: true
+ config:
+ access_mode: ReadWriteMany
+
+ .. code-block:: console
+
+ $ sky volumes apply juicefs-volume.yaml
+
+ 4. **Mount the volume to SkyPilot task** in your SkyPilot YAML:
+
+ .. code-block:: yaml
+
+ # task.yaml
+ num_nodes: 2
+
+ volumes:
+ # Mount the JuiceFS volume to /mnt/data across all nodes
+ /mnt/data: juicefs-volume
+
+ run: |
+ # Verify the volume is mounted and accessible
+ df -h /mnt/data
+ ls -la /mnt/data
+
+ .. code-block:: console
+
+ # Launch the cluster with the JuiceFS volume
+ $ sky launch -c juicefs-cluster task.yaml
+
+ .. tab-item:: Nebius shared file system
+ :sync: nebius-tab
+
+ The following steps use the `Nebius shared file system `_ as a SkyPilot volume via the CSI driver. For a simpler setup, we recommend using the :ref:`hostPath-based method ` described above, which mounts the filesystem directly from the host without requiring a CSI driver.
+
+ 1. **Set up the Nebius filesystem infrastructure** by following the official documentation:
+
+ - `Create a shared filesystem `_
+ - `Create a node group and mount the filesystem `_
+ - `Install the CSI driver `_
+
+ 2. **Verify the storage class** - Confirm that the ``csi-mounted-fs-path-sc`` storage class has been created:
+
+ .. code-block:: console
+
+ $ kubectl get storageclass
+ NAME PROVISIONER RECLAIMPOLICY VOLUMEBINDINGMODE ALLOWVOLUMEEXPANSION AGE
+ csi-mounted-fs-path-sc mounted-fs-path.csi.nebius.ai Delete WaitForFirstConsumer false 10m
+
+ 3. **Create a SkyPilot volume for Nebius file system** with a volume YAML:
+
+ .. code-block:: yaml
+
+ # nebius-volume.yaml
+ name: nebius-pvc
+ type: k8s-pvc
+ infra: k8s
+ size: 100Gi
+ config:
+ storage_class_name: csi-mounted-fs-path-sc
+ access_mode: ReadWriteMany
+
+ .. code-block:: console
+
+ $ sky volumes apply nebius-volume.yaml
+
+ 4. **Mount the volume to SkyPilot task** in your SkyPilot YAML:
+
+ .. code-block:: yaml
+
+ # task.yaml
+ num_nodes: 2
+
+ volumes:
+ # Mount the Nebius shared filesystem to /mnt/data across all nodes
+ /mnt/data: nebius-pvc
+
+ run: |
+ # Verify the volume is mounted and accessible
+ df -h /mnt/data
+ ls -la /mnt/data
+
+ .. code-block:: console
+
+ # Launch the cluster with the Nebius volume
+ $ sky launch -c nebius-cluster task.yaml
+
+
.. _volumes-on-runpod:
Volumes on RunPod
@@ -728,3 +762,10 @@ Managing volumes
~~~~~~~~~~~~~~~~
Same as Kubernetes volumes, refer to :ref:`volumes-on-kubernetes-manage` for more details.
+
+.. _ssh-node-pool-volumes:
+
+Volumes on SSH node pools
+-------------------------
+
+With SSH node pools, you can mount host volumes or directories into SkyPilot clusters and managed jobs. See :ref:`Volumes on SSH node pools ` for details.
diff --git a/docs/source/reference/yaml-spec.rst b/docs/source/reference/yaml-spec.rst
index 181f1c0d6a9..9ab1e9132d9 100644
--- a/docs/source/reference/yaml-spec.rst
+++ b/docs/source/reference/yaml-spec.rst
@@ -49,6 +49,12 @@ Below is the configuration syntax and some example values. See details under ea
:ref:`autostop `:
idle_minutes: 10
wait_for: none
+ :ref:`hook `: |
+ cd my-code-base
+ git add .
+ git commit -m "Auto-commit before shutdown"
+ git push
+ hook_timeout: 300
:ref:`any_of `:
- infra: aws/us-west-2
@@ -270,6 +276,12 @@ Format:
- ``jobs_and_ssh`` (default): Wait for in‑progress jobs and SSH connections to finish
- ``jobs``: Only wait for in‑progress jobs
- ``none``: Wait for nothing; autostop right after ``idle_minutes``
+ - ``hook``: Optional script to execute before autostop. The script runs on the remote cluster before stopping or tearing down. If the hook fails, autostop will still proceed but a warning will be logged.
+
+ See :ref:`Autostop hooks ` for detailed explanation and examples.
+
+ - ``hook_timeout``: Timeout in seconds for hook execution (default: 3600 = 1 hour, minimum: 1).
+ If the hook exceeds this timeout, it will be terminated and autostop continues.
```` can be one of:
- ``m``: minutes (default if not specified)
@@ -317,6 +329,20 @@ OR
idle_minutes: 10
wait_for: none # Stop after 10 minutes, regardless of running jobs or SSH connections
+OR
+
+.. code-block:: yaml
+
+ resources:
+ autostop:
+ idle_minutes: 10
+ hook: |
+ cd my-code-base
+ git add .
+ git commit -m "Auto-commit before shutdown"
+ git push
+ hook_timeout: 300
+
.. _yaml-spec-resources-accelerators:
@@ -912,7 +938,7 @@ We can also specify the exit codes that should always trigger recovery, regardle
We can specify multiple exit codes:
-.. code-block:: yaml
+.. code-block:: yaml
resources:
job_recovery:
diff --git a/docs/source/running-jobs/environment-variables.rst b/docs/source/running-jobs/environment-variables.rst
index 74433cc50b9..cc3d217c01b 100644
--- a/docs/source/running-jobs/environment-variables.rst
+++ b/docs/source/running-jobs/environment-variables.rst
@@ -188,8 +188,8 @@ Environment variables for ``setup``
3.4.5.6
* - ``SKYPILOT_SETUP_NUM_GPUS_PER_NODE``
- Number of GPUs per node in the cluster.
-
- Note that GPUs may not be available at this stage. Do not assume
+
+ Note that GPUs may not be available at this stage. Do not assume
GPUs are available during setup.
- 1
@@ -214,6 +214,9 @@ Environment variables for ``setup``
)['cloud']
- {"cluster_name": "my-cluster-name", "cloud": "GCP", "region": "us-central1", "zone": "us-central1-a"}
+ * - ``SKYPILOT_USER``
+ - The username of the user who launched the job.
+ - alice
* - ``SKYPILOT_SERVE_REPLICA_ID``
- The ID of a replica within the service (starting from 1). Available only for a :ref:`service `'s replica task.
- 1
@@ -270,6 +273,9 @@ Environment variables for ``run``
os.environ['SKYPILOT_CLUSTER_INFO']
)['cloud']
- {"cluster_name": "my-cluster-name", "cloud": "GCP", "region": "us-central1", "zone": "us-central1-a"}
+ * - ``SKYPILOT_USER``
+ - The username of the user who launched the job.
+ - alice
* - ``SKYPILOT_SERVE_REPLICA_ID``
- The ID of a replica within the service (starting from 1). Available only for a :ref:`service `'s replica task.
- - 1
\ No newline at end of file
+ - 1
diff --git a/examples/airflow/README.md b/examples/airflow/README.md
index 69faf5728fc..92998eb5c5b 100644
--- a/examples/airflow/README.md
+++ b/examples/airflow/README.md
@@ -11,7 +11,7 @@ This example uses a remote SkyPilot API Server to manage shared state across inv
-**💡 Tip:** SkyPilot also supports defining and running pipelines without Airflow. Check out [Jobs Pipelines](https://skypilot.readthedocs.io/en/latest/examples/managed-jobs.html#job-pipelines) for more information.
+**💡 Tip:** SkyPilot also supports defining and running pipelines without Airflow. Check out [Jobs Pipelines](https://docs.skypilot.co/en/latest/examples/managed-jobs.html#job-pipelines) for more information.
## Why use SkyPilot with Airflow?
In AI workflows, **the transition from development to production is hard**.
@@ -28,7 +28,7 @@ production Airflow cluster. Behind the scenes, SkyPilot handles environment setu
Here's how you can use SkyPilot to take your dev workflows to production in Airflow:
1. **Define and test your workflow as SkyPilot tasks**.
- - Use `sky launch` and [Sky VSCode integration](https://skypilot.readthedocs.io/en/latest/examples/interactive-development.html#dev-vscode) to run, debug and iterate on your code.
+ - Use `sky launch` and [Sky VSCode integration](https://docs.skypilot.co/en/latest/examples/interactive-development.html#dev-vscode) to run, debug and iterate on your code.
2. **Orchestrate SkyPilot tasks in Airflow** by invoking `sky launch` on their YAMLs as a task in the Airflow DAG.
- Airflow does the scheduling, logging, and monitoring, while SkyPilot handles the infra setup and task execution.
@@ -78,7 +78,7 @@ The train and eval step can be run in a similar way:
sky launch -c train --env DATA_BUCKET_NAME= --env DATA_BUCKET_STORE_TYPE=s3 train.yaml
```
-Hint: You can use `ssh` and VSCode to [interactively develop](https://skypilot.readthedocs.io/en/latest/examples/interactive-development.html) and debug the tasks.
+Hint: You can use `ssh` and VSCode to [interactively develop](https://docs.skypilot.co/en/latest/examples/interactive-development.html) and debug the tasks.
Note: `eval` can be optionally run on the same cluster as `train` with `sky exec`.
diff --git a/examples/aws_efa/README.md b/examples/aws_efa/README.md
index 1a23e306a2d..747c7f79749 100644
--- a/examples/aws_efa/README.md
+++ b/examples/aws_efa/README.md
@@ -6,23 +6,15 @@ Elastic Fabric Adapter (EFA) is an AWS alternative to Nvidia infiniband that ena
### TL;DR: enable EFA with SkyPilot
-You can enable EFA on AWS HyperPod/EKS clusters with an simple additional setting in your SkyPilot YAML:
+You can enable EFA on AWS HyperPod/EKS clusters by simply adding ``network_tier: best`` to your resources specification:
```yaml
-config:
- kubernetes:
- pod_config:
- spec:
- containers:
- - resources:
- limits:
- vpc.amazonaws.com/efa: 4
- requests:
- vpc.amazonaws.com/efa: 4
+resources:
+ infra: k8s
+ accelerators: A100:8
+ network_tier: best
```
-
-
### Enable EFA with HyperPod/EKS
* On HyperPod (backed by EKS), EFA is enabled by default, and you don't need to do anything.
@@ -40,42 +32,15 @@ hyperpod-i-0da69b9076c7ff6a4 ml.p4d.24xlarge 8 4
...
```
-### Access HyperPod and run distributed job with SkyPilot
-
-To access HyperPod and run distributed job with SkyPilot, see the SkyPilot [HyperPod example](https://github.com/skypilot-org/skypilot/blob/master/examples/hyperpod-eks).
-
-#### Adding EFA configurations in SkyPilot YAML
-
-To enable EFA in SkyPilot YAML, you can specify the following section in the SkyPilot YAML:
-
-```yaml
-config:
- kubernetes:
- pod_config:
- spec:
- containers:
- - resources:
- limits:
- vpc.amazonaws.com/efa: 4
- requests:
- vpc.amazonaws.com/efa: 4
-```
-
-This section is important for EFA integration:
-
-- `config.kubernetes.pod_config`: Provides Kubernetes-specific pod configuration
-- `spec.containers[0].resources`: Defines resource requirements
- - `limits.vpc.amazonaws.com/efa: 4`: Limits the Pod to use 4 EFA devices
- - `requests.vpc.amazonaws.com/efa: 4`: Requests 4 EFA devices for the Pod
-
-
-The `vpc.amazonaws.com/efa` resource type is exposed by the AWS EFA device plugin in Kubernetes.
+The `vpc.amazonaws.com/efa` resource is exposed by the AWS EFA device plugin in Kubernetes.
To see how many EFA are available for each instance types that have EFA, see the [Network cards](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-eni.html#network-cards) list in the Amazon EC2 User Guide.
Check the following table for the GPU and EFA count mapping for AWS instance types:
| Instance Type | GPU Type | #EFA |
|---------------|----------|------|
+| p6-b300.48xlarge | B300:8 | 16 |
+| p6-b200.48xlarge | B200:8 | 8 |
| p4d.24xlarge | A100:8 | 4 |
| p4de.24xlarge | A100:8 | 4 |
| p5.48xlarge | H100:8 | 32 |
@@ -100,15 +65,13 @@ Check the following table for the GPU and EFA count mapping for AWS instance typ
| g6e.16xlarge | L40S:1 | 1 |
| g6e.24xlarge | L40S:4 | 2 |
| g6e.48xlarge | L40S:8 | 4 |
-
-
-Update the EFA number in the [`nccl_efa.yaml`](https://github.com/skypilot-org/skypilot/blob/master/examples/aws_efa/nccl_efa.yaml) for the GPUs you use.
+| gr6.8xlarge | L4:1 | 1 |
### Running NCCL test with EFA using SkyPilot
Check the [`nccl_efa.yaml`](https://github.com/skypilot-org/skypilot/blob/master/examples/aws_efa/nccl_efa.yaml) for the complete SkyPilot cluster yaml configurations.
-The `image_id` provides the environment setup for [NCCL](https://developer.nvidia.com/nccl) (NVIDIA Collective Communications Library) and EFA (Elastic Fabric Adapter).
+The image [public.ecr.aws/hpc-cloud/nccl-tests:latest](https://github.com/aws-samples/awsome-distributed-training/blob/main/micro-benchmarks/nccl-tests/nccl-tests.Dockerfile) provides the environment setup for [NCCL](https://developer.nvidia.com/nccl) (NVIDIA Collective Communications Library) and EFA (Elastic Fabric Adapter).
To run the NCCL test with EFA support:
@@ -123,10 +86,7 @@ SkyPilot will:
4. Output performance metrics showing the benefits of EFA for distributed training
> **NOTE:**
-> We can turn off EFA with `nccl_efa.yaml` by passing an env:
-> ```bash
-> sky launch -c efa --env USE_EFA=false nccl_efa.yaml
-> ```
+> We can turn off EFA in `nccl_efa.yaml` by commenting out `network_tier: best`.
#### Benchmark results
@@ -178,7 +138,7 @@ EFA provides much higher throughput than the traditional TCP transport. Enabling
## Using EFA on AWS VM
-For the instance types listed in the GPU and EFA count mapping table in the [Adding EFA configurations in SkyPilot YAML](#adding-efa-configurations-in-skypilot-yaml) section, the EFA can be enabled by setting `resources.network_tier: best` in the task YAML.
+For the instance types listed in the GPU and EFA count mapping table in the [Enable EFA with HyperPod/EKS](#enable-efa-with-hyperpodeks) section, the EFA can be enabled by setting `resources.network_tier: best` in the task YAML.
```yaml
resources:
diff --git a/examples/aws_efa/nccl_efa.yaml b/examples/aws_efa/nccl_efa.yaml
index 809bd0c4b9b..ab2937e597c 100644
--- a/examples/aws_efa/nccl_efa.yaml
+++ b/examples/aws_efa/nccl_efa.yaml
@@ -5,14 +5,11 @@ name: nccl-efa-eks
resources:
infra: k8s
accelerators: A100:8
- cpus: 90+
image_id: docker:public.ecr.aws/hpc-cloud/nccl-tests:latest
+ network_tier: best
num_nodes: 2
-envs:
- USE_EFA: "true"
-
run: |
if [ "${SKYPILOT_NODE_RANK}" == "0" ]; then
echo "Head node"
@@ -28,22 +25,6 @@ run: |
nodes=${nodes::-1}
echo "All nodes: ${nodes}"
- # Set environment variables
- export PATH=$PATH:/usr/local/cuda-12.2/bin:/opt/amazon/efa/bin:/usr/bin
- export LD_LIBRARY_PATH=/usr/local/cuda-12.2/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/nvidia/lib:$LD_LIBRARY_PATH
- export NCCL_HOME=/opt/nccl
- export CUDA_HOME=/usr/local/cuda-12.2
- export NCCL_DEBUG=INFO
- export NCCL_BUFFSIZE=8388608
- export NCCL_P2P_NET_CHUNKSIZE=524288
- export NCCL_TUNER_PLUGIN=/opt/aws-ofi-nccl/install/lib/libnccl-ofi-tuner.so
-
- if [ "${USE_EFA}" == "true" ]; then
- export FI_PROVIDER="efa"
- else
- export FI_PROVIDER=""
- fi
-
/opt/amazon/openmpi/bin/mpirun \
--allow-run-as-root \
--tag-output \
@@ -51,13 +32,9 @@ run: |
-np $NP \
-N $SKYPILOT_NUM_GPUS_PER_NODE \
--bind-to none \
- -x FI_PROVIDER \
-x PATH \
-x LD_LIBRARY_PATH \
-x NCCL_DEBUG=INFO \
- -x NCCL_BUFFSIZE \
- -x NCCL_P2P_NET_CHUNKSIZE \
- -x NCCL_TUNER_PLUGIN \
--mca pml ^cm,ucx \
--mca btl tcp,self \
--mca btl_tcp_if_exclude lo,docker0,veth_def_agent \
@@ -72,14 +49,3 @@ run: |
else
echo "Worker nodes"
fi
-
-config:
- kubernetes:
- pod_config:
- spec:
- containers:
- - resources:
- limits:
- vpc.amazonaws.com/efa: 4
- requests:
- vpc.amazonaws.com/efa: 4
diff --git a/examples/distributed_ray_train/ray_train.yaml b/examples/distributed_ray_train/ray_train.yaml
index 0ba202b884d..9a9a9314bff 100644
--- a/examples/distributed_ray_train/ray_train.yaml
+++ b/examples/distributed_ray_train/ray_train.yaml
@@ -5,6 +5,9 @@
resources:
accelerators: L4:2
memory: 64+
+ # On Slurm, it is recommended to use a Docker image to avoid permission
+ # issues with /tmp: https://github.com/ray-project/ray/issues/3899
+ # image_id: docker:rayproject/ray:nightly-py39-gpu
num_nodes: 2
diff --git a/examples/hyperpod-eks/README.md b/examples/hyperpod-eks/README.md
index 8c11be05272..951b97fe32d 100644
--- a/examples/hyperpod-eks/README.md
+++ b/examples/hyperpod-eks/README.md
@@ -5,7 +5,7 @@ This example shows how to run SkyPilot on AWS SageMaker HyperPod with EKS.
## Prerequisites
- An existing SageMaker HyperPod with EKS (or you can create one with AWS [doc](https://catalog.workshops.aws/sagemaker-hyperpod-eks/en-US/00-setup/own-account/01-workshop-infra-script))
-- SkyPilot installed: [installation doc](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html)
+- SkyPilot installed: [installation doc](https://docs.skypilot.co/en/latest/getting-started/installation.html)
```bash
pip install skypilot-nightly[kubernetes]
```
diff --git a/examples/managed_spot_queued_resource.yaml b/examples/managed_spot_queued_resource.yaml
new file mode 100644
index 00000000000..9a6347a5872
--- /dev/null
+++ b/examples/managed_spot_queued_resource.yaml
@@ -0,0 +1,27 @@
+name: minimal
+
+resources:
+ use_spot: true
+ infra: gcp/us-central1/us-central1-a
+ accelerators: tpu-v5litepod-16:1
+ accelerator_args:
+ runtime_version: v2-alpha-tpuv5-lite
+ gcp_queued_resource: true
+
+setup: |
+ echo "running setup"
+ pip install tqdm
+ pip install jax[tpu]
+
+run: |
+ conda env list
+ python -u - << EOF
+ import time
+ import tqdm
+ import jax
+ print(jax.devices())
+
+ for i in tqdm.trange(240):
+ time.sleep(1)
+
+ EOF
diff --git a/examples/metrics/kube_prometheus_node_exporter_service_monitor.yaml b/examples/metrics/kube_prometheus_node_exporter_service_monitor.yaml
deleted file mode 100644
index 2298eb24440..00000000000
--- a/examples/metrics/kube_prometheus_node_exporter_service_monitor.yaml
+++ /dev/null
@@ -1,26 +0,0 @@
-apiVersion: monitoring.coreos.com/v1
-kind: ServiceMonitor
-metadata:
- annotations:
- meta.helm.sh/release-name: kube-prometheus
- meta.helm.sh/release-namespace: skypilot
- labels:
- app.kubernetes.io/instance: kube-prometheus
- app.kubernetes.io/managed-by: Helm
- app.kubernetes.io/name: node-exporter
- name: kube-prometheus-node-exporter
- namespace: skypilot
-spec:
- endpoints:
- - port: metrics
- relabelings:
- - sourceLabels: [__meta_kubernetes_pod_node_name]
- targetLabel: node
- jobLabel: jobLabel
- namespaceSelector:
- matchNames:
- - skypilot
- selector:
- matchLabels:
- app.kubernetes.io/instance: kube-prometheus
- app.kubernetes.io/name: node-exporter
diff --git a/examples/metrics/prometheus-values.yaml b/examples/metrics/prometheus-values.yaml
new file mode 100644
index 00000000000..f2299785f0e
--- /dev/null
+++ b/examples/metrics/prometheus-values.yaml
@@ -0,0 +1,16 @@
+server:
+ persistentVolume:
+ enabled: true
+ size: 50Gi
+ retention: "1000d"
+ retentionSize: "43GB"
+kube-state-metrics:
+ enabled: true
+ metricLabelsAllowlist:
+ - pods=[skypilot-cluster,skypilot-cluster-name]
+prometheus-node-exporter:
+ enabled: false
+prometheus-pushgateway:
+ enabled: false
+alertmanager:
+ enabled: false
diff --git a/examples/metrics/skypilot_prometheus_server_service.yaml b/examples/metrics/skypilot_prometheus_server_service.yaml
deleted file mode 100644
index 1af9d7712a8..00000000000
--- a/examples/metrics/skypilot_prometheus_server_service.yaml
+++ /dev/null
@@ -1,22 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
- labels:
- app.kubernetes.io/component: prometheus
- name: skypilot-prometheus-server
- namespace: skypilot
-spec:
- internalTrafficPolicy: Cluster
- ipFamilies:
- - IPv4
- ipFamilyPolicy: SingleStack
- ports:
- - name: http
- port: 80
- protocol: TCP
- targetPort: 9090
- selector:
- app.kubernetes.io/component: prometheus
- app.kubernetes.io/name: prometheus
- sessionAffinity: None
- type: ClusterIP
diff --git a/examples/plugin/README.md b/examples/plugin/README.md
index f1b1e5c0f4d..dbe1d655673 100644
--- a/examples/plugin/README.md
+++ b/examples/plugin/README.md
@@ -1,9 +1,69 @@
# Example Plugins for SkyPilot API Server
-Usage:
+## Usage
```bash
$ pip install .
$ cp plugins.yaml ~/.sky/plugins.yaml
$ sky api stop; sky api start
```
+
+## Remote Controller Support
+
+Plugins can be automatically deployed to remote controllers (jobs controller, serve
+controller) by creating a separate `remote_plugins.yaml` file that specifies which
+plugins should be uploaded to controllers.
+
+### Setup
+
+1. Create `~/.sky/plugins.yaml` for API server plugins with `controller_wheel_path`:
+
+```yaml
+controller_wheel_path: dist
+
+plugins:
+- class: example_plugin.ExamplePlugin
+```
+
+2. Create `~/.sky/remote_plugins.yaml` for remote controller plugins:
+
+```yaml
+plugins:
+- class: example_plugin.ExamplePatchPlugin
+```
+
+When `remote_plugins.yaml` exists and contains plugins:
+1. All `.whl` files found in the directory specified in `controller_wheel_path` (in `plugins.yaml`) are uploaded to remote clusters via file mounts
+2. The wheels are installed in the SkyPilot runtime environment on the cluster
+3. The `remote_plugins.yaml` config is uploaded to the cluster (as `plugins.yaml`)
+
+This allows your plugins to run on both the API server (if specified in `plugins.yaml`) and on job/serve controllers (if specified in `remote_plugins.yaml`).
+
+**Note:** You must build the wheel files yourself before configuring them in `plugins.yaml`. All `.whl` files in the specified directory will be uploaded. For example:
+```bash
+python -m build # or python setup.py bdist_wheel
+# This typically creates wheel files in the dist/ directory
+```
+
+### Configuration
+
+The `plugins.yaml` schema supports the following top-level fields:
+
+- `controller_wheel_path` (optional): Path to a directory containing prebuilt plugin wheel files (.whl). All `.whl` files in this directory will be uploaded to controllers. If no `.whl` files are found in the directory, nothing will be uploaded.
+
+Both `plugins.yaml` and `remote_plugins.yaml` support the following fields per plugin entry:
+
+- `class` (required): The Python class path of the plugin (e.g., `module.ClassName`)
+- `parameters` (optional): Dictionary of parameters to pass to the plugin constructor
+
+### Environment Variables
+
+You can customize the paths to these configuration files using environment variables:
+
+- `SKYPILOT_SERVER_PLUGINS_CONFIG`: Path to `plugins.yaml` (default: `~/.sky/plugins.yaml`)
+- `SKYPILOT_SERVER_REMOTE_PLUGINS_CONFIG`: Path to `remote_plugins.yaml` (default: `~/.sky/remote_plugins.yaml`)
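+
+For example, to load these configs from a custom location (the paths below are illustrative), set the variables in the environment where the API server runs, then restart it:
+
+```bash
+export SKYPILOT_SERVER_PLUGINS_CONFIG=/etc/skypilot/plugins.yaml
+export SKYPILOT_SERVER_REMOTE_PLUGINS_CONFIG=/etc/skypilot/remote_plugins.yaml
+sky api stop; sky api start
+```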
diff --git a/examples/plugin/plugins.yaml b/examples/plugin/plugins.yaml
index 31b807daf29..2b0a58923e2 100644
--- a/examples/plugin/plugins.yaml
+++ b/examples/plugin/plugins.yaml
@@ -1,3 +1,8 @@
+# Path to a directory containing prebuilt wheel files (.whl) that will be uploaded
+# to remote clusters (jobs controller, serve controller).
+# All .whl files in this directory will be uploaded and installed.
+controller_wheel_path: dist
+
plugins:
- class: example_plugin.ExamplePlugin
- class: example_plugin.ExampleParameterizedPlugin
diff --git a/examples/plugin/remote_plugin.yaml b/examples/plugin/remote_plugin.yaml
new file mode 100644
index 00000000000..a072d06c14d
--- /dev/null
+++ b/examples/plugin/remote_plugin.yaml
@@ -0,0 +1,5 @@
+# Plugins specified here will be uploaded to remote controllers.
+# These plugins will be available on both the API server (if also in plugins.yaml)
+# and on remote controllers (jobs controller, serve controller).
+plugins:
+- class: example_plugin.ExamplePatchPlugin
diff --git a/examples/ray_basic/ray.yaml b/examples/ray_basic/ray.yaml
index 4d5041c7468..0c4fd6c07fc 100644
--- a/examples/ray_basic/ray.yaml
+++ b/examples/ray_basic/ray.yaml
@@ -6,6 +6,9 @@
resources:
cpus: 2+
+ # On Slurm, it is recommended to use a Docker image to avoid permission
+ # issues with /tmp: https://github.com/ray-project/ray/issues/3899
+ # image_id: docker:rayproject/ray:nightly-py39-cpu
num_nodes: 2
diff --git a/examples/redisvl-vector-search/README.md b/examples/redisvl-vector-search/README.md
index f5361613723..a7d1c8ab900 100644
--- a/examples/redisvl-vector-search/README.md
+++ b/examples/redisvl-vector-search/README.md
@@ -1,6 +1,6 @@
# RedisVL + SkyPilot: Vector Search at Scale
-Distributed vector search over [1M research papers](https://www.kaggle.com/datasets/nechbamohammed/research-papers-dataset) using [RedisVL](https://docs.redisvl.com/en/latest/) and [SkyPilot](https://skypilot.readthedocs.io/en/latest/).
+Distributed vector search over [1M research papers](https://www.kaggle.com/datasets/nechbamohammed/research-papers-dataset) using [RedisVL](https://docs.redisvl.com/en/latest/) and [SkyPilot](https://docs.skypilot.co/en/latest/).
📖 [Read the full blog post](https://blog.skypilot.co/redisvl-skypilot/).
diff --git a/examples/serve/nvidia-dynamo/README.md b/examples/serve/nvidia-dynamo/README.md
index ddf5b5e5cab..933025c503e 100644
--- a/examples/serve/nvidia-dynamo/README.md
+++ b/examples/serve/nvidia-dynamo/README.md
@@ -19,9 +19,17 @@ NVIDIA Dynamo is a high-performance inference framework designed for serving gen
- **Disaggregated Prefill & Decode**: Separates inference phases for optimal resource utilization
- **Dynamic GPU Scheduling**: Intelligent workload distribution across available GPUs
- **LLM-Aware Request Routing**: Smart routing based on model characteristics and cache states
-- **Accelerated Data Transfer**: High-performance data movement between nodes
+- **Accelerated Data Transfer**: High-performance data movement between nodes via NIXL
- **KV Cache Offloading**: Multi-tiered memory management for efficient cache utilization
+## Container Image
+
+These examples use the official NVIDIA Dynamo container images from NGC:
+- `nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.7.1` - SGLang backend (used in these examples)
+- `nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.7.1` - vLLM backend (alternative)
+
+The NGC images are freely accessible and include all necessary dependencies (NATS, etcd, NIXL, etc.).
+
## Launching Nvidia Dynamo with SkyPilot
### Single-Node Example (`nvidia-dynamo.sky.yaml`)
@@ -29,6 +37,7 @@ NVIDIA Dynamo is a high-performance inference framework designed for serving gen
- ✅ **OpenAI-Compatible API**: Drop-in replacement for OpenAI endpoints
- ✅ **Basic Load Balancing**: Round-robin request distribution
- ✅ **Auto-Discovery**: Dynamic worker registration
+- ✅ **No etcd Required**: Uses file-based KV store for single-node simplicity
### Multi-Node Example (`nvidia-dynamo-multinode.sky.yaml`)
- ✅ **KV-Aware Routing**: Intelligent cache-based request routing (`--router-mode kv`)
@@ -36,6 +45,7 @@ NVIDIA Dynamo is a high-performance inference framework designed for serving gen
- ✅ **Data Parallel Attention**: DP=2 across nodes (`--enable-dp-attention`)
- ✅ **Tensor Parallelism**: TP=8 per node for large model support
- ✅ **Disaggregated Transfer**: NIXL backend for KV cache transfers
+- ✅ **Centralized Services**: NATS and etcd run on head node, workers connect automatically
**Model**: `Qwen/Qwen3-8B` (8B parameter reasoning model)
diff --git a/examples/serve/nvidia-dynamo/nvidia-dynamo-multinode.sky.yaml b/examples/serve/nvidia-dynamo/nvidia-dynamo-multinode.sky.yaml
index 9d10ebf6d92..d07c5ab5063 100644
--- a/examples/serve/nvidia-dynamo/nvidia-dynamo-multinode.sky.yaml
+++ b/examples/serve/nvidia-dynamo/nvidia-dynamo-multinode.sky.yaml
@@ -2,7 +2,7 @@
#
# Usage:
#
-# sky launch -c dynamo-multi nvidia-dynamo-multinode.sky.yaml
+# sky launch -c dynamo-multi nvidia-dynamo-multinode.sky.yaml
#
# This config uses 2 nodes with 8x H100 GPUs each for disaggregated serving.
# Optionally override the model:
@@ -10,8 +10,10 @@
# sky launch -c dynamo-multi nvidia-dynamo-multinode.sky.yaml --env MODEL_NAME=meta-llama/Llama-3.1-8B-Instruct --env HF_TOKEN
resources:
- accelerators: H100:8
+ accelerators: {H100:8, H200:8}
ports: 8080
+ # Use the official NVIDIA Dynamo SGLang runtime image from NGC
+ image_id: docker:nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.7.1
num_nodes: 2
@@ -20,16 +22,7 @@ envs:
DIST_INIT_PORT: 29500
HF_TOKEN: "" # needed if a model is gated in HF Hub. Pass the value with `--env HF_TOKEN`
-setup: |
- sudo usermod -aG docker $USER
- sudo chmod 666 /var/run/docker.sock
- uv pip install "ai-dynamo[sglang]==0.5.0" accelerate --system --prerelease=allow
- uv pip install "sglang[all]==0.5.2" --system --prerelease=allow
- curl -fsSL -o docker-compose.yml https://raw.githubusercontent.com/ai-dynamo/dynamo/v0.5.0/deploy/docker-compose.yml
- docker compose -f docker-compose.yml up -d
-
run: |
- export GLOO_SOCKET_IFNAME=$(ip -o -4 route show to default | awk '{print $5}')
HEAD_IP=$(echo "$SKYPILOT_NODE_IPS" | head -n1)
TOTAL_GPUS=$((SKYPILOT_NUM_NODES * SKYPILOT_NUM_GPUS_PER_NODE))
@@ -38,11 +31,31 @@ run: |
TP_SIZE=$((TOTAL_GPUS / 2))
DP_SIZE=2
+ # Get the network interface for GLOO
+ export GLOO_SOCKET_IFNAME=$(ip -o -4 route show to default | awk '{print $5}')
+
if [ "${SKYPILOT_NODE_RANK}" == "0" ]; then
+ # Head node: Start NATS and etcd services
+ echo "Starting NATS and etcd on head node..."
+ nats-server -js &
+ etcd --listen-client-urls http://0.0.0.0:2379 \
+ --advertise-client-urls http://${HEAD_IP}:2379 \
+ --data-dir /tmp/etcd &
+ sleep 3
+
# Start frontend with KV-aware routing enabled
python -m dynamo.frontend --router-mode kv --http-port 8080 &
+ else
+ # Worker nodes: Wait for head node services to be ready
+ echo "Waiting for head node services..."
+ sleep 5
fi
+ # Set connection endpoints for NATS and etcd (all nodes connect to head)
+ export NATS_SERVER=nats://${HEAD_IP}:4222
+ export ETCD_ENDPOINTS=http://${HEAD_IP}:2379
+
+ # All nodes run SGLang workers
python -m dynamo.sglang \
--model-path $MODEL_NAME \
--tp $TP_SIZE \
@@ -57,4 +70,15 @@ run: |
--mem-fraction-static 0.82 \
--disaggregation-transfer-backend nixl \
--disaggregation-bootstrap-port 30001 \
- --page-size 16
\ No newline at end of file
+ --page-size 16
+
+# Kubernetes-specific configuration
+config:
+ kubernetes:
+ pod_config:
+ spec:
+ containers:
+ - securityContext:
+ # Run as root to allow SkyPilot to install necessary packages
+ runAsUser: 0
+ runAsGroup: 0
diff --git a/examples/serve/nvidia-dynamo/nvidia-dynamo.sky.yaml b/examples/serve/nvidia-dynamo/nvidia-dynamo.sky.yaml
index 44c0bb425d3..3bb455f581a 100644
--- a/examples/serve/nvidia-dynamo/nvidia-dynamo.sky.yaml
+++ b/examples/serve/nvidia-dynamo/nvidia-dynamo.sky.yaml
@@ -2,28 +2,40 @@
#
# Usage:
#
-# sky launch -c dynamo nvidia-dynamo.sky.yaml
+# sky launch -c dynamo nvidia-dynamo.sky.yaml
#
# Optionally override the model:
#
-# sky launch -c dynamo nvidia-dynamo.sky.yaml --env MODEL_NAME=meta-llama/Llama-3.1-8B-Instruct --env HF_TOKEN
+# sky launch -c dynamo nvidia-dynamo.sky.yaml --env MODEL_NAME=meta-llama/Llama-3.1-8B-Instruct --env HF_TOKEN
resources:
- accelerators: H100:1
+ accelerators: {H100:1, H200:1}
ports: 8080
+ # Use the official NVIDIA Dynamo SGLang runtime image from NGC
+ image_id: docker:nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.7.1
envs:
MODEL_NAME: Qwen/Qwen3-8B
HF_TOKEN: "" # needed if a model is gated in HF Hub. Pass the value with `--env HF_TOKEN`
-setup: |
- sudo usermod -aG docker $USER
- sudo chmod 666 /var/run/docker.sock
+run: |
+ # Start NATS server with JetStream enabled (required for Dynamo messaging)
+ nats-server -js &
+ sleep 2
- uv pip install "ai-dynamo[sglang]==0.4.1" accelerate --system --prerelease=allow
- curl -fsSL -o docker-compose.yml https://raw.githubusercontent.com/ai-dynamo/dynamo/release/0.4.1/deploy/docker-compose.yml
- docker compose -f docker-compose.yml up -d
+ # Start the Dynamo frontend (HTTP server + router)
+ python -m dynamo.frontend --http-port 8080 --store-kv file &
-run: |
- python -m dynamo.frontend &
- python -m dynamo.sglang --model $MODEL_NAME
\ No newline at end of file
+ # Start the SGLang worker
+ python -m dynamo.sglang --model $MODEL_NAME --store-kv file
+
+# Kubernetes-specific configuration
+config:
+ kubernetes:
+ pod_config:
+ spec:
+ containers:
+ - securityContext:
+ # Run as root to allow SkyPilot to install necessary packages
+ runAsUser: 0
+ runAsGroup: 0
diff --git a/examples/streamlit/README.md b/examples/streamlit/README.md
index 29fbf080144..29a9885a037 100644
--- a/examples/streamlit/README.md
+++ b/examples/streamlit/README.md
@@ -76,5 +76,5 @@ resources:
## Learn more
-- [SkyPilot Documentation](https://skypilot.readthedocs.io/)
+- [SkyPilot Documentation](https://docs.skypilot.co/)
- [Streamlit Documentation](https://docs.streamlit.io/)
diff --git a/examples/together_infiniband/README.md b/examples/together_infiniband/README.md
new file mode 100644
index 00000000000..2d60464cfc7
--- /dev/null
+++ b/examples/together_infiniband/README.md
@@ -0,0 +1,54 @@
+# Using InfiniBand in Together AI with SkyPilot
+
+SkyPilot provides the `network_tier: best` configuration option that automatically enables InfiniBand support on Together AI Kubernetes clusters. This eliminates the need for manual configuration of security contexts and environment variables.
+
+## InfiniBand on Together AI Kubernetes clusters
+
+Simply add `network_tier: best` to your resources specification:
+
+```yaml
+resources:
+ infra: k8s
+ accelerators: H100:8
+ network_tier: best
+```
+
+This enables InfiniBand for inter-GPU communication, and SkyPilot automatically sets up the required environment variables for you.
+
+## Running NCCL test using SkyPilot
+
+See [`nccl_network_tier.yaml`](https://github.com/skypilot-org/skypilot/blob/master/examples/together_infiniband/nccl_network_tier.yaml) for the complete SkyPilot cluster YAML configuration.
+
+The `image_id` provides the environment setup for [NCCL](https://developer.nvidia.com/nccl) (NVIDIA Collective Communications Library).
+
+To run the NCCL test with InfiniBand support:
+
+```bash
+sky launch -c infiniband nccl_network_tier.yaml
+```
+
+SkyPilot will:
+1. Schedule the job on the Kubernetes cluster with required GPU nodes
+2. Launch Pods and execute the NCCL performance test
+3. Output performance metrics showing the benefits of InfiniBand for distributed training
+
+An example result is shown below:
+
+```
+# out-of-place in-place
+# size count type redop root time algbw busbw #wrong time algbw busbw #wrong
+# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s)
+ 536870912 134217728 float sum -1 2407.5 222.99 418.12 0 2380.3 225.55 422.90 0
+ 1073741824 268435456 float sum -1 4524.3 237.33 444.99 0 4531.6 236.95 444.28 0
+ 2147483648 536870912 float sum -1 8787.5 244.38 458.21 0 8780.7 244.57 458.56 0
+ 4294967296 1073741824 float sum -1 17327 247.88 464.77 0 17328 247.86 464.74 0
+ 8589934592 2147483648 float sum -1 34462 249.26 467.36 0 34482 249.11 467.08 0
+# Out of bounds values : 0 OK
+# Avg bus bandwidth : 451.101
+```
+
+> **NOTE:** To run NCCL tests without InfiniBand, you can launch a cluster with `nccl_no_ib.yaml`:
+>
+> ```bash
+> sky launch -c no-infiniband nccl_no_ib.yaml
+> ```
diff --git a/examples/together_infiniband/nccl_network_tier.yaml b/examples/together_infiniband/nccl_network_tier.yaml
new file mode 100644
index 00000000000..075c7673edb
--- /dev/null
+++ b/examples/together_infiniband/nccl_network_tier.yaml
@@ -0,0 +1,52 @@
+# This example is used to test the NCCL performance with
+# InfiniBand on Together AI Kubernetes cluster.
+name: nccl-network-tier
+
+resources:
+ infra: k8s
+ accelerators: H100:8
+ image_id: docker:nvcr.io/nvidia/pytorch:24.07-py3
+ network_tier: best
+
+num_nodes: 2
+
+run: |
+ if [ "${SKYPILOT_NODE_RANK}" == "0" ]; then
+ echo "Head node"
+
+    # Total number of processes: NP should equal the total number of GPUs in the cluster
+ NP=$(($SKYPILOT_NUM_GPUS_PER_NODE * $SKYPILOT_NUM_NODES))
+
+ # Append :${SKYPILOT_NUM_GPUS_PER_NODE} to each IP as slots
+ nodes=""
+ for ip in $SKYPILOT_NODE_IPS; do
+ nodes="${nodes}${ip}:${SKYPILOT_NUM_GPUS_PER_NODE},"
+ done
+ nodes=${nodes::-1}
+ echo "All nodes: ${nodes}"
+
+ mpirun \
+ --allow-run-as-root \
+ --tag-output \
+ -H $nodes \
+ -np $NP \
+ -N $SKYPILOT_NUM_GPUS_PER_NODE \
+ --bind-to none \
+ -x PATH \
+ -x LD_LIBRARY_PATH \
+ -x NCCL_DEBUG=INFO \
+ -x NCCL_IB_HCA \
+ -x UCX_NET_DEVICES \
+ -x SHARP_COLL_ENABLE_PCI_RELAXED_ORDERING=1 \
+ -x NCCL_COLLNET_ENABLE=0 \
+ /usr/local/bin/all_reduce_perf_mpi \
+ -b 512M \
+ -e 8G \
+ -f 2 \
+ -g 1 \
+ -c 1 \
+ -w 5 \
+ -n 10
+ else
+ echo "Worker nodes"
+ fi
diff --git a/examples/together_infiniband/nccl_no_ib.yaml b/examples/together_infiniband/nccl_no_ib.yaml
new file mode 100644
index 00000000000..9f0bc40d773
--- /dev/null
+++ b/examples/together_infiniband/nccl_no_ib.yaml
@@ -0,0 +1,54 @@
+# This example is used to test the NCCL performance without
+# InfiniBand on Together AI Kubernetes cluster.
+name: nccl-no-ib
+
+resources:
+ infra: k8s
+ accelerators: H100:8
+ image_id: docker:nvcr.io/nvidia/pytorch:24.07-py3
+
+num_nodes: 2
+
+run: |
+ if [ "${SKYPILOT_NODE_RANK}" == "0" ]; then
+ echo "Head node"
+
+    # Total number of processes: NP should equal the total number of GPUs in the cluster
+ NP=$(($SKYPILOT_NUM_GPUS_PER_NODE * $SKYPILOT_NUM_NODES))
+
+ # Append :${SKYPILOT_NUM_GPUS_PER_NODE} to each IP as slots
+ nodes=""
+ for ip in $SKYPILOT_NODE_IPS; do
+ nodes="${nodes}${ip}:${SKYPILOT_NUM_GPUS_PER_NODE},"
+ done
+ nodes=${nodes::-1}
+ echo "All nodes: ${nodes}"
+
+ export NCCL_IB_HCA=""
+ export UCX_NET_DEVICES="eth0"
+
+ mpirun \
+ --allow-run-as-root \
+ --tag-output \
+ -H $nodes \
+ -np $NP \
+ -N $SKYPILOT_NUM_GPUS_PER_NODE \
+ --bind-to none \
+ -x PATH \
+ -x LD_LIBRARY_PATH \
+ -x NCCL_DEBUG=INFO \
+ -x NCCL_IB_HCA \
+ -x UCX_NET_DEVICES \
+ -x SHARP_COLL_ENABLE_PCI_RELAXED_ORDERING=1 \
+ -x NCCL_COLLNET_ENABLE=0 \
+ /usr/local/bin/all_reduce_perf_mpi \
+ -b 512M \
+ -e 8G \
+ -f 2 \
+ -g 1 \
+ -c 1 \
+ -w 5 \
+ -n 10
+ else
+ echo "Worker nodes"
+ fi
diff --git a/format.sh b/format.sh
index 62ac9889900..ee3f67f45fe 100755
--- a/format.sh
+++ b/format.sh
@@ -129,10 +129,7 @@ isort --profile black -l 88 -m 3 "sky/skylet/providers/ibm"
# TODO(zhwu): When more of the codebase is typed properly, the mypy flags
# should be set to do a more stringent check.
echo 'SkyPilot mypy:'
-# Workaround for mypy 1.14.1 cache serialization bug that causes
-# "AssertionError: Internal error: unresolved placeholder type None"
-# Using --cache-dir=/dev/null disables cache writing to avoid the error
-mypy $(cat tests/mypy_files.txt) --cache-dir=/dev/null
+mypy $(cat tests/mypy_files.txt)
# Run Pylint
echo 'Sky Pylint:'
@@ -159,9 +156,9 @@ if ! npm -v || ! node -v; then
# Don't fail the script if npm or node is not installed
# because it's not required for all users
else
- npm --prefix sky/dashboard install
+ output=$(npm --prefix sky/dashboard install 2>&1) || { echo "$output"; exit 1; }
npm --prefix sky/dashboard run lint
- npm --prefix sky/dashboard run format
+ npm --prefix sky/dashboard run format -- --log-level warn
echo "SkyPilot Dashboard linting and formatting: Done"
echo
fi
diff --git a/llm/rl-post-training-jobgroup/README.md b/llm/rl-post-training-jobgroup/README.md
new file mode 100644
index 00000000000..eba5c84e7b8
--- /dev/null
+++ b/llm/rl-post-training-jobgroup/README.md
@@ -0,0 +1,175 @@
+# RL Post-Training with Job Groups
+
+This example demonstrates a distributed RL post-training architecture using SkyPilot job groups. It trains an LLM on mathematical reasoning tasks using GRPO (Group Relative Policy Optimization) with verifiable rewards.
+
+## Architecture
+
+The example consists of 5 task types that communicate over HTTP, with built-in load balancing for scaling inference:
+
+### Components
+
+1. **data-server** (auxiliary): FastAPI server that serves math prompts from the GSM8K dataset. Provides batches of problems with ground truth answers.
+
+2. **rollout-server** (auxiliary, x2): SGLang inference servers with native load balancing:
+ - Using `num_nodes: 2` creates two GPU instances for higher throughput
+   - Head node (rank 0) runs an SGLang server (port 30001) and the SGLang router (port 30000)
+ - SGLang router provides cache-aware load balancing for optimal KV cache reuse
+
+3. **reward-server** (auxiliary): Verifies mathematical answers by comparing model outputs against ground truth. Returns binary rewards (1.0 for correct, 0.0 for incorrect).
+
+4. **replay-buffer** (auxiliary): Stores experience tuples (prompt, response, reward) for sampling during training. Supports priority-based sampling where high-reward experiences are sampled more frequently.
+
+5. **ppo-trainer** (primary): Multi-node training orchestrator that implements GRPO. Coordinates with all other services to fetch prompts, generate responses, compute rewards, store experiences, and update the policy.
+
+### Primary/Auxiliary Tasks
+
+The ppo-trainer is designated as the **primary task**. When training completes:
+- All auxiliary services (data-server, rollout-server, reward-server, replay-buffer) are automatically terminated after a 10-second grace period (`termination_delay: 10s`)
+- This ensures GPU and CPU resources are released promptly once training finishes
+- Without this feature, auxiliary services would run indefinitely
+
+## Usage
+
+### Prerequisites
+
+- SkyPilot configured with a Kubernetes cluster
+- GPU nodes available (H100 recommended for optimal performance)
+
+### Launch Training
+
+```bash
+sky jobs launch llm/rl-post-training-jobgroup/rlhf-math-jobgroup.yaml
+```
+
+### Monitor Training
+
+```bash
+# Check job status
+sky jobs queue
+
+# View logs for specific components
+sky jobs logs data-server
+sky jobs logs rollout-server
+sky jobs logs reward-server
+sky jobs logs replay-buffer
+sky jobs logs ppo-trainer
+```
+
+Or use the SkyPilot dashboard to monitor jobs.
+
+## Configuration
+
+### Environment Variables
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `MODEL_NAME` | `Qwen/Qwen2.5-0.5B-Instruct` | Model to train |
+| `NUM_EPOCHS` | `3` | Number of training epochs |
+| `BATCH_SIZE` | `4` | Training batch size |
+
+### Customizing Resources
+
+Edit the YAML to adjust resources per component:
+
+```yaml
+# For larger models, increase GPU memory
+resources:
+ accelerators: H100:1 # or A100:1
+ memory: 64+
+```
+
+## Service Discovery
+
+Components discover each other using job group DNS names:
+
+- `data-server-0.${SKYPILOT_JOBGROUP_NAME}:8000`
+- `rollout-server-0.${SKYPILOT_JOBGROUP_NAME}:30000` (SGLang router endpoint)
+- `rollout-server-0.${SKYPILOT_JOBGROUP_NAME}:30001` (SGLang backend 1)
+- `rollout-server-1.${SKYPILOT_JOBGROUP_NAME}:30001` (SGLang backend 2)
+- `reward-server-0.${SKYPILOT_JOBGROUP_NAME}:8002`
+- `replay-buffer-0.${SKYPILOT_JOBGROUP_NAME}:8003`
+
+This allows components to communicate without hardcoded IP addresses.
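+
+For example, a client can build these endpoints from the `SKYPILOT_JOBGROUP_NAME`
+environment variable, as the trainer does. A minimal sketch using `httpx`, with
+the endpoint and port taken from the list above:
+
+```python
+import os
+
+import httpx
+
+group = os.environ['SKYPILOT_JOBGROUP_NAME']
+data_server = f'http://data-server-0.{group}:8000'
+
+with httpx.Client(timeout=30.0) as client:
+    # Check the data server is healthy, then fetch a batch of prompts.
+    assert client.get(f'{data_server}/health').status_code == 200
+    batch = client.get(f'{data_server}/prompts',
+                       params={'batch_size': 4}).json()
+    print(len(batch['prompts']), 'prompts fetched')
+```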
+
+## Load Balancing
+
+The example uses [SGLang's native router](https://docs.sglang.ai/advanced_features/router.html) for load balancing:
+
+1. **Multiple rollout servers**: Using `num_nodes: 2` creates two SGLang instances
+2. **Head node router**: The head node (rank 0) runs an SGLang server (port 30001) and the SGLang router (port 30000)
+3. **Automatic discovery**: The router builds worker URLs from `SKYPILOT_NUM_NODES` and `SKYPILOT_JOBGROUP_NAME`
+4. **Transparent to clients**: The trainer only needs to know the head node endpoint
+
+### Scaling to More Servers
+
+To scale up, simply increase `num_nodes`:
+
+```yaml
+name: rollout-server
+num_nodes: 4 # Scale to 4 servers
+```
+
+The router on the head node automatically discovers all workers using:
+```bash
+for i in $(seq 0 $((SKYPILOT_NUM_NODES - 1))); do
+ WORKER_URLS="${WORKER_URLS} http://rollout-server-${i}.${SKYPILOT_JOBGROUP_NAME}:30001"
+done
+```
+
+### SGLang Router Features
+
+SGLang's native router (`sglang_router`) provides:
+- **Cache-aware routing**: Routes requests to maximize KV cache reuse
+- **Health checking**: Automatic failover away from unhealthy workers
+- **OpenAI-compatible API**: Transparent passthrough for clients
+- **Rust implementation**: Built for high performance
+
+## GRPO Algorithm
+
+GRPO (Group Relative Policy Optimization) is a simplified variant of PPO that:
+- Doesn't require a critic/value model
+- Uses group-relative advantages (compares rewards within a batch)
+- Works well with verifiable rewards (math, code)
+
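+The group-relative advantage is just a per-batch normalization of rewards, as
+implemented in `trainer.py` (minimal sketch):
+
+```python
+import torch
+
+# Binary verifiable rewards for one group of sampled responses.
+rewards = torch.tensor([1.0, 0.0, 1.0, 1.0])
+advantages = (rewards - rewards.mean()) / (rewards.std() + 1e-8)
+# The policy loss then weights response log-probs by these advantages:
+#   loss = -(response_log_probs * advantages).mean()
+```
+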
+The training loop:
+1. Fetch batch of prompts from data-server
+2. Generate responses using rollout-server
+3. Compute rewards using reward-server
+4. Store experiences in replay-buffer
+5. Calculate group-relative advantages
+6. Update policy with clipped surrogate loss
+7. Sample from replay-buffer for additional updates (experience replay)
+
+## Extending This Example
+
+### Using a Reward Model
+
+Replace the reward-server with a neural reward model:
+
+```python
+# In reward_server.py, load a reward model
+from transformers import AutoModelForSequenceClassification
+model = AutoModelForSequenceClassification.from_pretrained("Skywork/Skywork-Reward-Llama-3.1-8B-v0.2")
+```
+
+### Scaling Up
+
+For larger models:
+1. Increase SGLang tensor parallelism
+2. Use multiple GPUs per trainer node
+3. Enable gradient checkpointing
+
+### Adding a Critic
+
+For full PPO, add a critic-server component that estimates value functions.
+
+## References
+
+- [OpenRLHF](https://github.com/OpenRLHF/OpenRLHF) - Distributed RLHF framework
+- [VeRL](https://github.com/volcengine/verl) - Hybrid flow RLHF framework
+- [GRPO Paper](https://arxiv.org/abs/2402.03300) - Group Relative Policy Optimization
+- [GSM8K Dataset](https://huggingface.co/datasets/openai/gsm8k) - Math reasoning benchmark
diff --git a/llm/rl-post-training-jobgroup/code/data_server.py b/llm/rl-post-training-jobgroup/code/data_server.py
new file mode 100644
index 00000000000..a6d58ae876c
--- /dev/null
+++ b/llm/rl-post-training-jobgroup/code/data_server.py
@@ -0,0 +1,185 @@
+#!/usr/bin/env python3
+"""Data server for RLHF training - serves math prompts from GSM8K dataset.
+
+This server provides batches of math problems with their ground truth answers
+for training LLMs on mathematical reasoning tasks.
+
+Usage:
+ python data_server.py --port 8000
+"""
+
+import argparse
+import random
+from typing import List, Optional
+
+from fastapi import FastAPI
+from fastapi import Query
+from pydantic import BaseModel
+import uvicorn
+
+app = FastAPI(title="RLHF Data Server",
+ description="Serves math prompts for training")
+
+# Global state
+prompts_data: List[dict] = []
+current_index: int = 0
+
+
+class Prompt(BaseModel):
+ """A single prompt with its ground truth answer."""
+ id: int
+ prompt: str
+ ground_truth: str
+
+
+class PromptBatch(BaseModel):
+ """A batch of prompts."""
+ prompts: List[Prompt]
+ total_available: int
+
+
+def load_dataset():
+ """Load GSM8K dataset from HuggingFace."""
+ global prompts_data
+
+ try:
+ from datasets import load_dataset
+ print("Loading GSM8K dataset...")
+ dataset = load_dataset("openai/gsm8k", "main", split="train")
+
+ prompts_data = []
+ for i, item in enumerate(dataset):
+ # Extract the numerical answer from the solution
+ # GSM8K format: solution ends with "#### "
+ solution = item["answer"]
+ answer_marker = "####"
+ if answer_marker in solution:
+ ground_truth = solution.split(answer_marker)[-1].strip()
+ else:
+ ground_truth = solution.strip()
+
+ # Format prompt for instruction-following model
+ prompt = f"""Solve the following math problem step by step. End your solution with the final numerical answer.
+
+Problem: {item["question"]}
+
+Solution:"""
+
+ prompts_data.append({
+ "id": i,
+ "prompt": prompt,
+ "ground_truth": ground_truth
+ })
+
+ # Shuffle for training
+ random.shuffle(prompts_data)
+ print(f"Loaded {len(prompts_data)} prompts from GSM8K")
+
+ except Exception as e:
+ print(f"Error loading dataset: {e}")
+ # Fallback to simple math problems for testing
+ prompts_data = [
+ {
+ "id": 0,
+ "prompt": "What is 2 + 2?",
+ "ground_truth": "4"
+ },
+ {
+ "id": 1,
+ "prompt": "What is 10 * 5?",
+ "ground_truth": "50"
+ },
+ {
+ "id": 2,
+ "prompt": "What is 100 / 4?",
+ "ground_truth": "25"
+ },
+ {
+ "id": 3,
+ "prompt": "What is 7 + 8?",
+ "ground_truth": "15"
+ },
+ {
+ "id": 4,
+ "prompt": "What is 9 * 9?",
+ "ground_truth": "81"
+ },
+ ]
+ print(f"Using {len(prompts_data)} fallback prompts")
+
+
+@app.on_event("startup")
+async def startup_event():
+ """Load dataset on startup."""
+ load_dataset()
+
+
+@app.get("/health")
+async def health():
+ """Health check endpoint."""
+ return {"status": "healthy", "prompts_loaded": len(prompts_data)}
+
+
+@app.get("/prompts", response_model=PromptBatch)
+async def get_prompts(
+ batch_size: int = Query(default=8,
+ ge=1,
+ le=256,
+ description="Number of prompts to return"),
+ shuffle: bool = Query(default=True,
+ description="Whether to shuffle prompts")):
+ """Get a batch of prompts for training."""
+ global current_index
+
+ if not prompts_data:
+ return PromptBatch(prompts=[], total_available=0)
+
+ # Get batch of prompts
+ if shuffle:
+ batch = random.sample(prompts_data, min(batch_size, len(prompts_data)))
+ else:
+ # Sequential access with wraparound
+ batch = []
+ for _ in range(batch_size):
+ batch.append(prompts_data[current_index])
+ current_index = (current_index + 1) % len(prompts_data)
+
+ prompts = [Prompt(**p) for p in batch]
+ return PromptBatch(prompts=prompts, total_available=len(prompts_data))
+
+
+@app.get("/prompt/{prompt_id}", response_model=Optional[Prompt])
+async def get_prompt_by_id(prompt_id: int):
+ """Get a specific prompt by ID."""
+ for p in prompts_data:
+ if p["id"] == prompt_id:
+ return Prompt(**p)
+ return None
+
+
+@app.post("/reset")
+async def reset_index():
+ """Reset the sequential index to the beginning."""
+ global current_index
+ current_index = 0
+ return {"status": "reset", "index": current_index}
+
+
+def main():
+ parser = argparse.ArgumentParser(description="RLHF Data Server")
+ parser.add_argument("--port",
+ type=int,
+ default=8000,
+ help="Port to run server on")
+ parser.add_argument("--host",
+ type=str,
+ default="0.0.0.0",
+ help="Host to bind to")
+ args = parser.parse_args()
+
+ print(f"Starting data server on {args.host}:{args.port}")
+ uvicorn.run(app, host=args.host, port=args.port)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/llm/rl-post-training-jobgroup/code/replay_buffer.py b/llm/rl-post-training-jobgroup/code/replay_buffer.py
new file mode 100644
index 00000000000..d4c1422e60b
--- /dev/null
+++ b/llm/rl-post-training-jobgroup/code/replay_buffer.py
@@ -0,0 +1,254 @@
+"""Replay Buffer Server for RLHF Training.
+
+This server provides a centralized experience replay buffer that stores
+(prompt, response, reward) tuples and allows sampling for training.
+
+Features:
+- Thread-safe storage with configurable capacity
+- Uniform random sampling
+- Priority-based sampling (optional)
+- Statistics tracking
+
+API Endpoints:
+- POST /add: Add experiences to the buffer
+- POST /sample: Sample a batch of experiences
+- GET /stats: Get buffer statistics
+- POST /clear: Clear the buffer
+- GET /health: Health check
+"""
+
+import argparse
+from collections import deque
+from dataclasses import dataclass
+from dataclasses import field
+import random
+import threading
+import time
+from typing import List, Optional
+
+from fastapi import FastAPI
+from fastapi import HTTPException
+from pydantic import BaseModel
+import uvicorn
+
+
+@dataclass
+class Experience:
+ """A single experience tuple."""
+ prompt: str
+ response: str
+ reward: float
+ ground_truth: Optional[str] = None
+ timestamp: float = field(default_factory=time.time)
+ priority: float = 1.0
+
+
+class AddExperienceRequest(BaseModel):
+ """Request to add experiences to the buffer."""
+ experiences: List[dict]
+
+
+class SampleRequest(BaseModel):
+ """Request to sample from the buffer."""
+ batch_size: int = 4
+ prioritized: bool = False
+
+
+class ReplayBuffer:
+ """Thread-safe replay buffer with priority sampling support."""
+
+ def __init__(self, capacity: int = 10000):
+ self.capacity = capacity
+ self.buffer: deque = deque(maxlen=capacity)
+ self.lock = threading.Lock()
+ self.total_added = 0
+ self.total_sampled = 0
+
+ def add(self, experiences: List[Experience]) -> int:
+ """Add experiences to the buffer."""
+ with self.lock:
+ for exp in experiences:
+ self.buffer.append(exp)
+ self.total_added += 1
+ return len(experiences)
+
+ def sample(self,
+ batch_size: int,
+ prioritized: bool = False) -> List[Experience]:
+ """Sample a batch of experiences."""
+ with self.lock:
+ if len(self.buffer) == 0:
+ return []
+
+ actual_size = min(batch_size, len(self.buffer))
+
+ if prioritized:
+ # Priority-based sampling (higher reward = higher priority)
+ priorities = [exp.priority for exp in self.buffer]
+ total_priority = sum(priorities)
+ if total_priority > 0:
+ probs = [p / total_priority for p in priorities]
+ indices = random.choices(range(len(self.buffer)),
+ weights=probs,
+ k=actual_size)
+ else:
+ indices = random.sample(range(len(self.buffer)),
+ actual_size)
+ else:
+ # Uniform random sampling
+ indices = random.sample(range(len(self.buffer)), actual_size)
+
+ samples = [self.buffer[i] for i in indices]
+ self.total_sampled += len(samples)
+ return samples
+
+ def clear(self):
+ """Clear the buffer."""
+ with self.lock:
+ self.buffer.clear()
+
+ def stats(self) -> dict:
+ """Get buffer statistics."""
+ with self.lock:
+ rewards = [exp.reward for exp in self.buffer]
+ return {
+ "size": len(self.buffer),
+ "capacity": self.capacity,
+ "total_added": self.total_added,
+ "total_sampled": self.total_sampled,
+ "avg_reward": sum(rewards) / len(rewards) if rewards else 0,
+ "min_reward": min(rewards) if rewards else 0,
+ "max_reward": max(rewards) if rewards else 0,
+ "positive_ratio": sum(1 for r in rewards if r > 0) /
+ len(rewards) if rewards else 0,
+ }
+
+
+# Initialize FastAPI app
+app = FastAPI(title="Replay Buffer Server",
+ description="Experience replay buffer for RLHF training")
+
+# Global buffer instance
+buffer: Optional[ReplayBuffer] = None
+
+
+@app.on_event("startup")
+async def startup():
+    """Initialize the replay buffer on startup (if not already created)."""
+    global buffer
+    if buffer is None:
+        buffer = ReplayBuffer(capacity=10000)
+    print(f"Replay buffer initialized with capacity {buffer.capacity}")
+
+
+@app.get("/health")
+async def health():
+ """Health check endpoint."""
+ return {
+ "status": "healthy",
+ "buffer_size": len(buffer.buffer) if buffer else 0
+ }
+
+
+@app.post("/add")
+async def add_experiences(request: AddExperienceRequest):
+ """Add experiences to the replay buffer.
+
+ Each experience should have:
+ - prompt: The input prompt
+ - response: The model's response
+ - reward: The reward score
+ - ground_truth (optional): The correct answer
+ """
+ if buffer is None:
+ raise HTTPException(status_code=503, detail="Buffer not initialized")
+
+ experiences = []
+ for exp_dict in request.experiences:
+ exp = Experience(
+ prompt=exp_dict.get("prompt", ""),
+ response=exp_dict.get("response", ""),
+ reward=exp_dict.get("reward", 0.0),
+ ground_truth=exp_dict.get("ground_truth"),
+ priority=abs(exp_dict.get("reward", 0.0)) +
+ 0.1 # Higher reward = higher priority
+ )
+ experiences.append(exp)
+
+ added = buffer.add(experiences)
+
+ return {"added": added, "buffer_size": len(buffer.buffer)}
+
+
+@app.post("/sample")
+async def sample_experiences(request: SampleRequest):
+ """Sample a batch of experiences from the buffer.
+
+ Args:
+ batch_size: Number of experiences to sample
+ prioritized: If True, use priority-based sampling
+ """
+ if buffer is None:
+ raise HTTPException(status_code=503, detail="Buffer not initialized")
+
+ if len(buffer.buffer) == 0:
+ return {"experiences": [], "message": "Buffer is empty"}
+
+ samples = buffer.sample(request.batch_size, request.prioritized)
+
+ return {
+ "experiences": [{
+ "prompt": exp.prompt,
+ "response": exp.response,
+ "reward": exp.reward,
+ "ground_truth": exp.ground_truth,
+ "timestamp": exp.timestamp
+ } for exp in samples],
+ "sampled": len(samples),
+ "buffer_size": len(buffer.buffer)
+ }
+
+
+@app.get("/stats")
+async def get_stats():
+ """Get buffer statistics."""
+ if buffer is None:
+ raise HTTPException(status_code=503, detail="Buffer not initialized")
+
+ return buffer.stats()
+
+
+@app.post("/clear")
+async def clear_buffer():
+ """Clear the replay buffer."""
+ if buffer is None:
+ raise HTTPException(status_code=503, detail="Buffer not initialized")
+
+ buffer.clear()
+ return {"status": "cleared"}
+
+
+def main():
+ parser = argparse.ArgumentParser(description="Replay Buffer Server")
+ parser.add_argument("--port", type=int, default=8003, help="Port to run on")
+ parser.add_argument("--host",
+ type=str,
+ default="0.0.0.0",
+ help="Host to bind to")
+ parser.add_argument("--capacity",
+ type=int,
+ default=10000,
+ help="Buffer capacity")
+ args = parser.parse_args()
+
+    # Create the buffer with the requested capacity before the app starts.
+ global buffer
+ buffer = ReplayBuffer(capacity=args.capacity)
+
+ print(f"Starting Replay Buffer Server on {args.host}:{args.port}")
+ print(f"Buffer capacity: {args.capacity}")
+
+ uvicorn.run(app, host=args.host, port=args.port)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/llm/rl-post-training-jobgroup/code/reward_server.py b/llm/rl-post-training-jobgroup/code/reward_server.py
new file mode 100644
index 00000000000..52cf93395a6
--- /dev/null
+++ b/llm/rl-post-training-jobgroup/code/reward_server.py
@@ -0,0 +1,170 @@
+#!/usr/bin/env python3
+"""Reward server for RLHF training - verifies math answers.
+
+This server computes rewards by comparing generated answers against ground truth.
+Uses simple string/numeric matching for math problems.
+
+Usage:
+ python reward_server.py --port 8002
+"""
+
+import argparse
+import re
+from typing import List, Optional
+
+from fastapi import FastAPI
+from pydantic import BaseModel
+import uvicorn
+
+app = FastAPI(title="RLHF Reward Server",
+ description="Computes rewards for math responses")
+
+
+class RewardRequest(BaseModel):
+ """Request for computing reward for a single response."""
+ prompt: str
+ response: str
+ ground_truth: str
+
+
+class RewardResponse(BaseModel):
+ """Reward computation result."""
+ reward: float
+ extracted_answer: Optional[str]
+ ground_truth: str
+ correct: bool
+
+
+class BatchRewardRequest(BaseModel):
+ """Request for computing rewards for multiple responses."""
+ items: List[RewardRequest]
+
+
+class BatchRewardResponse(BaseModel):
+ """Batch reward computation results."""
+ rewards: List[RewardResponse]
+ mean_reward: float
+ accuracy: float
+
+
+def extract_answer(response: str) -> Optional[str]:
+ """Extract the final numerical answer from a response.
+
+ Tries multiple patterns commonly used in math solutions:
+ 1. "#### " (GSM8K format)
+ 2. "The answer is "
+ 3. "= " at the end
+ 4. Last number in the response
+ """
+ response = response.strip()
+
+ # Pattern 1: GSM8K format "#### "
+ match = re.search(r'####\s*([+-]?\d+(?:,\d{3})*(?:\.\d+)?)', response)
+ if match:
+ return match.group(1).replace(',', '')
+
+ # Pattern 2: "The answer is "
+ match = re.search(
+ r'[Tt]he\s+(?:final\s+)?answer\s+is[:\s]*([+-]?\d+(?:,\d{3})*(?:\.\d+)?)',
+ response)
+ if match:
+ return match.group(1).replace(',', '')
+
+ # Pattern 3: "= " at the end of a line
+ match = re.search(r'=\s*([+-]?\d+(?:,\d{3})*(?:\.\d+)?)\s*$', response,
+ re.MULTILINE)
+ if match:
+ return match.group(1).replace(',', '')
+
+ # Pattern 4: Last number in the response
+ numbers = re.findall(r'([+-]?\d+(?:,\d{3})*(?:\.\d+)?)', response)
+ if numbers:
+ return numbers[-1].replace(',', '')
+
+ return None
+
+
+def normalize_answer(answer: Optional[str]) -> str:
+ """Normalize an answer for comparison."""
+ if answer is None:
+ return ""
+ # Remove commas, whitespace, and convert to lowercase
+ answer = answer.replace(',', '').strip().lower()
+ # Try to parse as number and format consistently
+ try:
+ num = float(answer)
+ # If it's a whole number, return as int
+ if num == int(num):
+ return str(int(num))
+ return str(num)
+ except ValueError:
+ return answer
+
+
+def compute_reward(prompt: str, response: str,
+ ground_truth: str) -> RewardResponse:
+ """Compute reward by comparing extracted answer to ground truth."""
+ extracted = extract_answer(response)
+ normalized_extracted = normalize_answer(extracted)
+ normalized_truth = normalize_answer(ground_truth)
+
+ # Check if answers match
+ correct = normalized_extracted == normalized_truth
+
+ # Binary reward: 1.0 for correct, 0.0 for incorrect
+ reward = 1.0 if correct else 0.0
+
+ return RewardResponse(reward=reward,
+ extracted_answer=extracted,
+ ground_truth=ground_truth,
+ correct=correct)
+
+
+@app.get("/health")
+async def health():
+ """Health check endpoint."""
+ return {"status": "healthy"}
+
+
+@app.post("/reward", response_model=RewardResponse)
+async def get_reward(request: RewardRequest):
+ """Compute reward for a single response."""
+ return compute_reward(request.prompt, request.response,
+ request.ground_truth)
+
+
+@app.post("/batch_reward", response_model=BatchRewardResponse)
+async def get_batch_reward(request: BatchRewardRequest):
+ """Compute rewards for a batch of responses."""
+ rewards = [
+ compute_reward(item.prompt, item.response, item.ground_truth)
+ for item in request.items
+ ]
+
+ total_reward = sum(r.reward for r in rewards)
+ correct_count = sum(1 for r in rewards if r.correct)
+
+ return BatchRewardResponse(
+ rewards=rewards,
+ mean_reward=total_reward / len(rewards) if rewards else 0.0,
+ accuracy=correct_count / len(rewards) if rewards else 0.0)
+
+
+def main():
+ parser = argparse.ArgumentParser(description="RLHF Reward Server")
+ parser.add_argument("--port",
+ type=int,
+ default=8002,
+ help="Port to run server on")
+ parser.add_argument("--host",
+ type=str,
+ default="0.0.0.0",
+ help="Host to bind to")
+ args = parser.parse_args()
+
+ print(f"Starting reward server on {args.host}:{args.port}")
+ uvicorn.run(app, host=args.host, port=args.port)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/llm/rl-post-training-jobgroup/code/trainer.py b/llm/rl-post-training-jobgroup/code/trainer.py
new file mode 100644
index 00000000000..66bf8f5d04a
--- /dev/null
+++ b/llm/rl-post-training-jobgroup/code/trainer.py
@@ -0,0 +1,454 @@
+#!/usr/bin/env python3
+"""GRPO Trainer for RLHF math training.
+
+This trainer orchestrates the RLHF pipeline by:
+1. Fetching prompts from data-server
+2. Generating responses via rollout-server (SGLang)
+3. Computing rewards via reward-server
+4. Storing experiences in replay-buffer
+5. Updating the policy using GRPO (Group Relative Policy Optimization)
+
+GRPO is a simplified variant of PPO that doesn't require a critic model,
+making it popular for math/code tasks with verifiable rewards.
+
+Usage:
+ python trainer.py \
+ --data-server localhost:8000 \
+ --rollout-server localhost:8001 \
+ --reward-server localhost:8002 \
+ --replay-buffer localhost:8003 \
+ --num-epochs 3
+"""
+
+import argparse
+from dataclasses import dataclass
+import time
+from typing import List, Optional
+
+from accelerate import Accelerator
+import httpx
+import torch
+from transformers import AutoModelForCausalLM
+from transformers import AutoTokenizer
+
+
+@dataclass
+class TrainingConfig:
+ """Training configuration."""
+ data_server: str
+ rollout_server: str
+ reward_server: str
+ replay_buffer: Optional[str] = None
+ model_name: str = "Qwen/Qwen2.5-0.5B-Instruct"
+ batch_size: int = 4
+ num_epochs: int = 3
+ learning_rate: float = 1e-6
+ max_new_tokens: int = 512
+ temperature: float = 0.7
+ num_samples_per_prompt: int = 4 # For GRPO, generate multiple samples
+ kl_coef: float = 0.01
+ clip_range: float = 0.2
+ use_replay_buffer: bool = True # Whether to use replay buffer for training
+
+
+class RLHFTrainer:
+ """GRPO trainer that coordinates with external services."""
+
+ def __init__(self, config: TrainingConfig):
+ self.config = config
+ self.accelerator = Accelerator()
+
+ # HTTP clients for services
+ self.http_client = httpx.Client(timeout=120.0)
+
+ # Load model and tokenizer
+ if self.accelerator.is_main_process:
+ print(f"Loading model: {config.model_name}")
+
+ self.tokenizer = AutoTokenizer.from_pretrained(config.model_name)
+ if self.tokenizer.pad_token is None:
+ self.tokenizer.pad_token = self.tokenizer.eos_token
+
+ self.model = AutoModelForCausalLM.from_pretrained(
+ config.model_name, torch_dtype=torch.bfloat16, device_map="auto")
+
+ # Optimizer
+ self.optimizer = torch.optim.AdamW(self.model.parameters(),
+ lr=config.learning_rate)
+
+ # Prepare with accelerator
+ self.model, self.optimizer = self.accelerator.prepare(
+ self.model, self.optimizer)
+
+ # Statistics
+ self.total_steps = 0
+ self.total_rewards = 0.0
+
+ def wait_for_services(self,
+ max_retries: int = 30,
+ retry_interval: int = 10):
+ """Wait for all services to be available."""
+ services = [
+ ("data-server", f"http://{self.config.data_server}/health"),
+ ("rollout-server", f"http://{self.config.rollout_server}/health"),
+ ("reward-server", f"http://{self.config.reward_server}/health"),
+ ]
+ if self.config.replay_buffer:
+ services.append(
+ ("replay-buffer", f"http://{self.config.replay_buffer}/health"))
+
+ for name, url in services:
+ if self.accelerator.is_main_process:
+ print(f"Waiting for {name} at {url}...")
+
+ for attempt in range(max_retries):
+ try:
+ response = self.http_client.get(url)
+ if response.status_code == 200:
+ if self.accelerator.is_main_process:
+ print(f" {name} is ready!")
+ break
+                except Exception:
+                    pass  # Service not reachable yet; retry below.
+
+                if attempt < max_retries - 1:
+                    time.sleep(retry_interval)
+                else:
+                    raise RuntimeError(
+                        f"Service {name} not available after "
+                        f"{max_retries} retries")
+
+ def fetch_prompts(self, batch_size: int) -> List[dict]:
+ """Fetch a batch of prompts from data server."""
+ url = f"http://{self.config.data_server}/prompts"
+ response = self.http_client.get(url, params={"batch_size": batch_size})
+ response.raise_for_status()
+ data = response.json()
+ return data["prompts"]
+
+ def generate_responses(self, prompts: List[str]) -> List[str]:
+ """Generate responses using the rollout server (SGLang)."""
+ url = f"http://{self.config.rollout_server}/v1/completions"
+
+ responses = []
+ for prompt in prompts:
+ payload = {
+ "model": self.config.model_name,
+ "prompt": prompt,
+ "max_tokens": self.config.max_new_tokens,
+ "temperature": self.config.temperature,
+ "n": 1,
+ }
+ try:
+ response = self.http_client.post(url, json=payload)
+ response.raise_for_status()
+ data = response.json()
+ text = data["choices"][0]["text"]
+ responses.append(text)
+ except Exception as e:
+ print(f"Error generating response: {e}")
+ responses.append("")
+
+ return responses
+
+ def compute_rewards(self, prompts: List[str], responses: List[str],
+ ground_truths: List[str]) -> List[float]:
+ """Compute rewards using the reward server."""
+ url = f"http://{self.config.reward_server}/batch_reward"
+
+ items = [{
+ "prompt": p,
+ "response": r,
+ "ground_truth": gt
+ } for p, r, gt in zip(prompts, responses, ground_truths)]
+
+ response = self.http_client.post(url, json={"items": items})
+ response.raise_for_status()
+ data = response.json()
+
+ return [r["reward"] for r in data["rewards"]]
+
+ def store_experiences(self, prompts: List[str], responses: List[str],
+ rewards: List[float], ground_truths: List[str]):
+ """Store experiences in the replay buffer."""
+ if not self.config.replay_buffer:
+ return
+
+ url = f"http://{self.config.replay_buffer}/add"
+ experiences = [{
+ "prompt": p,
+ "response": r,
+ "reward": rw,
+ "ground_truth": gt
+ } for p, r, rw, gt in zip(prompts, responses, rewards, ground_truths)]
+
+ try:
+ response = self.http_client.post(url,
+ json={"experiences": experiences})
+ response.raise_for_status()
+ except Exception as e:
+ print(f"Warning: Failed to store experiences in replay buffer: {e}")
+
+ def sample_from_replay_buffer(self,
+ batch_size: int) -> Optional[List[dict]]:
+ """Sample experiences from the replay buffer."""
+ if not self.config.replay_buffer:
+ return None
+
+ url = f"http://{self.config.replay_buffer}/sample"
+ try:
+ response = self.http_client.post(url,
+ json={
+ "batch_size": batch_size,
+ "prioritized": True
+ })
+ response.raise_for_status()
+ data = response.json()
+ if data["experiences"]:
+ return data["experiences"]
+ except Exception as e:
+ print(f"Warning: Failed to sample from replay buffer: {e}")
+ return None
+
+ def get_replay_buffer_stats(self) -> Optional[dict]:
+ """Get replay buffer statistics."""
+ if not self.config.replay_buffer:
+ return None
+
+ url = f"http://{self.config.replay_buffer}/stats"
+ try:
+ response = self.http_client.get(url)
+ response.raise_for_status()
+ return response.json()
+        except Exception:
+            return None
+
+ def compute_grpo_loss(self, prompts: List[str], responses: List[str],
+ rewards: List[float]) -> torch.Tensor:
+ """Compute GRPO loss for policy update.
+
+ GRPO uses group-relative advantages: for each prompt, we compare
+ the reward of each response to the mean reward of all responses
+ for that prompt.
+ """
+ # Tokenize prompts and responses together
+ full_texts = [p + r for p, r in zip(prompts, responses)]
+ encodings = self.tokenizer(full_texts,
+ return_tensors="pt",
+ padding=True,
+ truncation=True,
+ max_length=1024).to(self.accelerator.device)
+
+ # Get prompt lengths for masking
+ prompt_encodings = self.tokenizer(prompts,
+ return_tensors="pt",
+ padding=True,
+ truncation=True,
+ max_length=512)
+ prompt_lengths = prompt_encodings.attention_mask.sum(dim=1)
+
+ # Forward pass
+ outputs = self.model(**encodings, labels=encodings.input_ids)
+
+ # Compute per-token log probabilities
+ logits = outputs.logits[:, :-1, :]
+ labels = encodings.input_ids[:, 1:]
+ log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
+ token_log_probs = torch.gather(log_probs, 2,
+ labels.unsqueeze(-1)).squeeze(-1)
+
+ # Mask out prompt tokens (only count response tokens)
+ response_mask = torch.zeros_like(token_log_probs)
+ for i, plen in enumerate(prompt_lengths):
+ response_mask[i, plen - 1:] = 1.0
+ response_mask = response_mask * encodings.attention_mask[:, 1:]
+
+ # Sum log probs for each response
+ response_log_probs = (token_log_probs * response_mask).sum(dim=1)
+
+ # Convert rewards to tensor and compute advantages
+ rewards_tensor = torch.tensor(rewards,
+ device=self.accelerator.device,
+ dtype=torch.float32)
+
+ # GRPO: normalize rewards within batch (group-relative)
+ mean_reward = rewards_tensor.mean()
+ std_reward = rewards_tensor.std() + 1e-8
+ advantages = (rewards_tensor - mean_reward) / std_reward
+
+ # Policy gradient loss (negative because we maximize reward)
+ loss = -(response_log_probs * advantages).mean()
+
+ return loss
+
+ def train_step(self) -> dict:
+ """Execute one training step."""
+ self.model.train()
+
+ # 1. Fetch prompts
+ prompt_data = self.fetch_prompts(self.config.batch_size)
+ prompts = [p["prompt"] for p in prompt_data]
+ ground_truths = [p["ground_truth"] for p in prompt_data]
+
+ # 2. Generate responses
+ responses = self.generate_responses(prompts)
+
+ # 3. Compute rewards
+ rewards = self.compute_rewards(prompts, responses, ground_truths)
+
+ # 4. Store experiences in replay buffer
+ self.store_experiences(prompts, responses, rewards, ground_truths)
+
+ # 5. Compute loss and update with fresh experiences
+ loss = self.compute_grpo_loss(prompts, responses, rewards)
+
+ self.optimizer.zero_grad()
+ self.accelerator.backward(loss)
+ self.optimizer.step()
+
+ # 6. Optionally do additional update with replay buffer samples
+ replay_loss = None
+ if self.config.use_replay_buffer and self.config.replay_buffer:
+ replay_experiences = self.sample_from_replay_buffer(
+ self.config.batch_size)
+ if replay_experiences and len(replay_experiences) >= 2:
+ replay_prompts = [e["prompt"] for e in replay_experiences]
+ replay_responses = [e["response"] for e in replay_experiences]
+ replay_rewards = [e["reward"] for e in replay_experiences]
+
+ replay_loss = self.compute_grpo_loss(replay_prompts,
+ replay_responses,
+ replay_rewards)
+ self.optimizer.zero_grad()
+ self.accelerator.backward(replay_loss)
+ self.optimizer.step()
+
+ # Update statistics
+ self.total_steps += 1
+ mean_reward = sum(rewards) / len(rewards)
+ self.total_rewards += mean_reward
+
+ result = {
+ "loss": loss.item(),
+ "mean_reward": mean_reward,
+ "accuracy": sum(1 for r in rewards if r > 0) / len(rewards),
+ "num_samples": len(prompts)
+ }
+ if replay_loss is not None:
+ result["replay_loss"] = replay_loss.item()
+ return result
+
+ def train(self):
+ """Run the full training loop."""
+ if self.accelerator.is_main_process:
+ print("=" * 60)
+ print("GRPO Training for Math")
+ print("=" * 60)
+ print(f"Model: {self.config.model_name}")
+ print(f"Batch size: {self.config.batch_size}")
+ print(f"Epochs: {self.config.num_epochs}")
+ print(f"Learning rate: {self.config.learning_rate}")
+ print("=" * 60)
+
+ # Wait for services
+ self.wait_for_services()
+
+ # Training loop
+ steps_per_epoch = 100 # Configurable
+ for epoch in range(self.config.num_epochs):
+ epoch_rewards = []
+ epoch_losses = []
+
+ for step in range(steps_per_epoch):
+ metrics = self.train_step()
+ epoch_rewards.append(metrics["mean_reward"])
+ epoch_losses.append(metrics["loss"])
+
+ if self.accelerator.is_main_process and step % 10 == 0:
+ print(f"Epoch {epoch+1}/{self.config.num_epochs} | "
+ f"Step {step+1}/{steps_per_epoch} | "
+ f"Loss: {metrics['loss']:.4f} | "
+ f"Reward: {metrics['mean_reward']:.4f} | "
+ f"Accuracy: {metrics['accuracy']:.2%}")
+
+ # Epoch summary
+ if self.accelerator.is_main_process:
+ mean_epoch_reward = sum(epoch_rewards) / len(epoch_rewards)
+ mean_epoch_loss = sum(epoch_losses) / len(epoch_losses)
+ print(f"\n=== Epoch {epoch+1} Complete ===")
+ print(f"Mean Reward: {mean_epoch_reward:.4f}")
+ print(f"Mean Loss: {mean_epoch_loss:.4f}")
+
+ # Print replay buffer stats
+ buffer_stats = self.get_replay_buffer_stats()
+ if buffer_stats:
+ print(
+ f"Replay Buffer: {buffer_stats['size']}/{buffer_stats['capacity']} "
+ f"(avg_reward: {buffer_stats['avg_reward']:.4f}, "
+ f"positive_ratio: {buffer_stats['positive_ratio']:.2%})"
+ )
+ print()
+
+ if self.accelerator.is_main_process:
+ print("=" * 60)
+ print("Training Complete!")
+ print(f"Total steps: {self.total_steps}")
+ print(
+ f"Average reward: {self.total_rewards / self.total_steps:.4f}")
+ print("=" * 60)
+
+
+def main():
+ parser = argparse.ArgumentParser(description="GRPO Trainer for RLHF")
+ parser.add_argument("--data-server",
+ type=str,
+ required=True,
+ help="Data server address (host:port)")
+ parser.add_argument("--rollout-server",
+ type=str,
+ required=True,
+ help="Rollout server address (host:port)")
+ parser.add_argument("--reward-server",
+ type=str,
+ required=True,
+ help="Reward server address (host:port)")
+ parser.add_argument("--replay-buffer",
+ type=str,
+ default=None,
+ help="Replay buffer address (host:port)")
+ parser.add_argument("--model",
+ type=str,
+ default="Qwen/Qwen2.5-0.5B-Instruct",
+ help="Model name or path")
+ parser.add_argument("--batch-size",
+ type=int,
+ default=4,
+ help="Training batch size")
+ parser.add_argument("--num-epochs",
+ type=int,
+ default=3,
+ help="Number of training epochs")
+ parser.add_argument("--learning-rate",
+ type=float,
+ default=1e-6,
+ help="Learning rate")
+ args = parser.parse_args()
+
+ config = TrainingConfig(
+ data_server=args.data_server,
+ rollout_server=args.rollout_server,
+ reward_server=args.reward_server,
+ replay_buffer=args.replay_buffer,
+ model_name=args.model,
+ batch_size=args.batch_size,
+ num_epochs=args.num_epochs,
+ learning_rate=args.learning_rate,
+ )
+
+ trainer = RLHFTrainer(config)
+ trainer.train()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/llm/rl-post-training-jobgroup/rlhf-math-jobgroup-cpu.yaml b/llm/rl-post-training-jobgroup/rlhf-math-jobgroup-cpu.yaml
new file mode 100644
index 00000000000..732b53a7dc0
--- /dev/null
+++ b/llm/rl-post-training-jobgroup/rlhf-math-jobgroup-cpu.yaml
@@ -0,0 +1,184 @@
+# RLHF Math Training with Job Groups - CPU Test Version
+#
+# This is a simplified CPU-only version for testing the job group functionality.
+# It demonstrates the service connectivity without requiring GPUs.
+#
+# Primary/Auxiliary Behavior:
+# The test-client is the primary task. When tests complete, all auxiliary
+# services are terminated after a 5-second grace period.
+#
+# Usage:
+# sky jobs launch llm/rl-post-training-jobgroup/rlhf-math-jobgroup-cpu.yaml
+---
+name: rlhf-math-cpu
+execution: parallel
+primary_tasks: [test-client]
+termination_delay: 5s
+
+---
+# Data Server: Serves math prompts from GSM8K dataset
+name: data-server
+resources:
+ cpus: 2
+ memory: 4+
+ infra: kubernetes
+
+file_mounts:
+ /code: llm/rl-post-training-jobgroup/code
+
+setup: |
+ pip install fastapi uvicorn datasets
+
+run: |
+ echo "Starting data server..."
+ echo "JobGroup: ${SKYPILOT_JOBGROUP_NAME}"
+ echo "This server provides math prompts at http://data-server-0.${SKYPILOT_JOBGROUP_NAME}:8000"
+
+ cd /code
+ python data_server.py --port 8000
+
+---
+# Reward Server: Verifies math answers against ground truth
+name: reward-server
+resources:
+ cpus: 2
+ memory: 4+
+ infra: kubernetes
+
+file_mounts:
+ /code: llm/rl-post-training-jobgroup/code
+
+setup: |
+ pip install fastapi uvicorn
+
+run: |
+ echo "Starting reward server..."
+ echo "JobGroup: ${SKYPILOT_JOBGROUP_NAME}"
+ echo "Reward API at http://reward-server-0.${SKYPILOT_JOBGROUP_NAME}:8002"
+
+ cd /code
+ python reward_server.py --port 8002
+
+---
+# Replay Buffer: Stores experience tuples for training
+name: replay-buffer
+resources:
+ cpus: 2
+ memory: 4+
+ infra: kubernetes
+
+file_mounts:
+ /code: llm/rl-post-training-jobgroup/code
+
+setup: |
+ pip install fastapi uvicorn
+
+run: |
+ echo "Starting replay buffer server..."
+ echo "JobGroup: ${SKYPILOT_JOBGROUP_NAME}"
+ echo "Replay Buffer API at http://replay-buffer-0.${SKYPILOT_JOBGROUP_NAME}:8003"
+
+ cd /code
+ python replay_buffer.py --port 8003 --capacity 1000
+
+---
+# Test Client: Verifies connectivity between services
+name: test-client
+resources:
+ cpus: 2
+ memory: 4+
+ infra: kubernetes
+
+setup: |
+ pip install httpx
+
+run: |
+ echo "Starting test client..."
+ echo "JobGroup: ${SKYPILOT_JOBGROUP_NAME}"
+
+ # Service discovery via job group DNS
+ DATA_SERVER="data-server-0.${SKYPILOT_JOBGROUP_NAME}:8000"
+ REWARD_SERVER="reward-server-0.${SKYPILOT_JOBGROUP_NAME}:8002"
+ REPLAY_BUFFER="replay-buffer-0.${SKYPILOT_JOBGROUP_NAME}:8003"
+
+ echo "Data server: ${DATA_SERVER}"
+ echo "Reward server: ${REWARD_SERVER}"
+ echo "Replay buffer: ${REPLAY_BUFFER}"
+
+ # Wait for services to be ready
+ echo "Waiting for services to be available..."
+ sleep 30
+
+ # Test data server
+ echo "Testing data server..."
+ for i in {1..5}; do
+ if curl -s "http://${DATA_SERVER}/health" | grep -q "healthy"; then
+ echo "Data server is healthy!"
+ break
+ fi
+ echo "Waiting for data server... attempt $i"
+ sleep 5
+ done
+
+ # Fetch some prompts
+ echo "Fetching prompts..."
+ PROMPTS=$(curl -s "http://${DATA_SERVER}/prompts?batch_size=2")
+ echo "Prompts: ${PROMPTS}"
+
+ # Test reward server
+ echo "Testing reward server..."
+ for i in {1..5}; do
+ if curl -s "http://${REWARD_SERVER}/health" | grep -q "healthy"; then
+ echo "Reward server is healthy!"
+ break
+ fi
+ echo "Waiting for reward server... attempt $i"
+ sleep 5
+ done
+
+ # Test reward computation
+ echo "Testing reward computation..."
+ REWARD=$(curl -s -X POST "http://${REWARD_SERVER}/reward" \
+ -H "Content-Type: application/json" \
+ -d '{"prompt": "What is 2+2?", "response": "The answer is 4", "ground_truth": "4"}')
+ echo "Reward response: ${REWARD}"
+
+ # Test replay buffer
+ echo "Testing replay buffer..."
+ for i in {1..5}; do
+ if curl -s "http://${REPLAY_BUFFER}/health" | grep -q "healthy"; then
+ echo "Replay buffer is healthy!"
+ break
+ fi
+ echo "Waiting for replay buffer... attempt $i"
+ sleep 5
+ done
+
+ # Add experience to replay buffer
+ echo "Adding experience to replay buffer..."
+ ADD_RESULT=$(curl -s -X POST "http://${REPLAY_BUFFER}/add" \
+ -H "Content-Type: application/json" \
+ -d '{"experiences": [{"prompt": "What is 2+2?", "response": "The answer is 4", "reward": 1.0, "ground_truth": "4"}]}')
+ echo "Add result: ${ADD_RESULT}"
+
+ # Get replay buffer stats
+ echo "Getting replay buffer stats..."
+ STATS=$(curl -s "http://${REPLAY_BUFFER}/stats")
+ echo "Stats: ${STATS}"
+
+ # Sample from replay buffer
+ echo "Sampling from replay buffer..."
+ SAMPLE=$(curl -s -X POST "http://${REPLAY_BUFFER}/sample" \
+ -H "Content-Type: application/json" \
+ -d '{"batch_size": 1}')
+ echo "Sample: ${SAMPLE}"
+
+ echo ""
+ echo "=========================================="
+ echo "All services are working correctly!"
+ echo "=========================================="
+ echo ""
+ echo "Job group connectivity test complete."
+
+ # Keep running to allow inspection
+ sleep 300
diff --git a/llm/rl-post-training-jobgroup/rlhf-math-jobgroup.yaml b/llm/rl-post-training-jobgroup/rlhf-math-jobgroup.yaml
new file mode 100644
index 00000000000..be405b409bd
--- /dev/null
+++ b/llm/rl-post-training-jobgroup/rlhf-math-jobgroup.yaml
@@ -0,0 +1,226 @@
+# RLHF Math Training with Job Groups
+#
+# This example demonstrates a distributed RLHF architecture using SkyPilot job groups.
+# It trains an LLM on mathematical reasoning using GRPO (Group Relative Policy Optimization)
+# with verifiable rewards.
+#
+# Architecture:
+# - data-server (auxiliary): Serves GSM8K math prompts
+# - rollout-server (auxiliary, x2): SGLang instances + SGLang router
+# - reward-server (auxiliary): Verifies math answers against ground truth
+# - replay-buffer (auxiliary): Stores experience tuples for sampling
+# - ppo-trainer (primary): Orchestrates GRPO training across multiple nodes
+#
+# Primary/Auxiliary Behavior:
+# The ppo-trainer is the primary task. When training completes, all auxiliary
+# services (data-server, rollout-server, reward-server, replay-buffer) are
+# terminated after a 10-second grace period to ensure clean shutdown.
+#
+# Load Balancing:
+# The head node runs SGLang's native router (sglang_router) which provides
+# cache-aware load balancing across all SGLang instances for optimal KV cache reuse.
+# The trainer connects to the router endpoint on port 30000.
+#
+# Usage:
+# sky jobs launch llm/rl-post-training-jobgroup/rlhf-math-jobgroup.yaml
+#
+# The components communicate over the job group network using DNS names:
+# - data-server-0.${SKYPILOT_JOBGROUP_NAME}:8000
+# - rollout-server-0.${SKYPILOT_JOBGROUP_NAME}:30000 (SGLang router endpoint)
+# - rollout-server-0.${SKYPILOT_JOBGROUP_NAME}:30001 (SGLang backend 1)
+# - rollout-server-1.${SKYPILOT_JOBGROUP_NAME}:30001 (SGLang backend 2)
+# - reward-server-0.${SKYPILOT_JOBGROUP_NAME}:8002
+# - replay-buffer-0.${SKYPILOT_JOBGROUP_NAME}:8003
+---
+name: rlhf-math
+execution: parallel
+primary_tasks: [ppo-trainer]
+termination_delay: 10s
+
+---
+# Data Server: Serves math prompts from GSM8K dataset
+name: data-server
+resources:
+ cpus: 4
+ memory: 16+
+ infra: kubernetes
+
+file_mounts:
+ /code: llm/rl-post-training-jobgroup/code
+
+setup: |
+ uv pip install fastapi uvicorn datasets --system
+
+run: |
+ echo "Starting data server..."
+ echo "JobGroup: ${SKYPILOT_JOBGROUP_NAME}"
+ echo "This server provides math prompts at http://data-server-0.${SKYPILOT_JOBGROUP_NAME}:8000"
+
+ cd /code
+ python data_server.py --port 8000
+
+---
+# Rollout Servers: Multiple SGLang instances with SGLang router on head node
+# Using num_nodes=2 to create rollout-server-0 and rollout-server-1
+# Head node (rank 0) runs both SGLang server and SGLang router for load balancing
+name: rollout-server
+num_nodes: 2
+resources:
+ accelerators: H100:1
+ memory: 32+
+ infra: kubernetes
+
+envs:
+ MODEL_NAME: Qwen/Qwen2.5-0.5B-Instruct
+
+setup: |
+ # Install system dependencies (libnuma is required by SGLang kernel)
+ sudo apt-get update && sudo apt-get install -y libnuma-dev
+ uv pip install "sglang[all]" sglang-router --system
+
+run: |
+ echo "Starting rollout server with SGLang..."
+ echo "JobGroup: ${SKYPILOT_JOBGROUP_NAME}"
+ echo "Node rank: ${SKYPILOT_NODE_RANK} / ${SKYPILOT_NUM_NODES}"
+ echo "Model: ${MODEL_NAME}"
+
+ # Start SGLang server in background
+ python -m sglang.launch_server \
+ --model ${MODEL_NAME} \
+ --host 0.0.0.0 \
+ --port 30001 &
+ SGLANG_PID=$!
+
+ # On head node, also run the SGLang router for load balancing
+ if [ "${SKYPILOT_NODE_RANK}" == "0" ]; then
+ echo "Head node: starting SGLang router..."
+
+ # Build worker URL list for all rollout servers
+ WORKER_URLS=""
+ for i in $(seq 0 $((SKYPILOT_NUM_NODES - 1))); do
+ WORKER_URLS="${WORKER_URLS} http://rollout-server-${i}.${SKYPILOT_JOBGROUP_NAME}:30001"
+ done
+
+ echo "Load balancing across:${WORKER_URLS}"
+ echo "Router API available at http://rollout-server-0.${SKYPILOT_JOBGROUP_NAME}:30000/v1"
+
+ # Wait for SGLang backends to start
+ sleep 60
+
+ python -m sglang_router.launch_router \
+ --worker-urls ${WORKER_URLS} \
+ --host 0.0.0.0 \
+ --port 30000 \
+ --policy cache_aware &
+ ROUTER_PID=$!
+
+ # Wait for both processes
+ wait $SGLANG_PID $ROUTER_PID
+ else
+ # Worker nodes just run SGLang server
+ wait $SGLANG_PID
+ fi
+
+---
+# Reward Server: Verifies math answers against ground truth
+name: reward-server
+resources:
+ cpus: 4
+ memory: 8+
+ infra: kubernetes
+
+file_mounts:
+ /code: llm/rl-post-training-jobgroup/code
+
+setup: |
+ uv pip install fastapi uvicorn --system
+
+run: |
+ echo "Starting reward server..."
+ echo "JobGroup: ${SKYPILOT_JOBGROUP_NAME}"
+ echo "Reward API at http://reward-server-0.${SKYPILOT_JOBGROUP_NAME}:8002"
+
+ cd /code
+ python reward_server.py --port 8002
+
+---
+# Replay Buffer: Stores experience tuples for training
+name: replay-buffer
+resources:
+ cpus: 4
+ memory: 16+
+ infra: kubernetes
+
+file_mounts:
+ /code: llm/rl-post-training-jobgroup/code
+
+setup: |
+ uv pip install fastapi uvicorn --system
+
+run: |
+ echo "Starting replay buffer server..."
+ echo "JobGroup: ${SKYPILOT_JOBGROUP_NAME}"
+ echo "Replay Buffer API at http://replay-buffer-0.${SKYPILOT_JOBGROUP_NAME}:8003"
+
+ cd /code
+ python replay_buffer.py --port 8003 --capacity 10000
+
+---
+# PPO Trainer: Multi-node GRPO training
+name: ppo-trainer
+resources:
+ accelerators: H100:1
+ memory: 32+
+ infra: kubernetes
+num_nodes: 2
+
+envs:
+ MODEL_NAME: Qwen/Qwen2.5-0.5B-Instruct
+ NUM_EPOCHS: 3
+ BATCH_SIZE: 4
+
+file_mounts:
+ /code: llm/rl-post-training-jobgroup/code
+
+setup: |
+ uv pip install torch transformers accelerate httpx --system
+
+run: |
+ echo "Starting GRPO trainer..."
+ echo "JobGroup: ${SKYPILOT_JOBGROUP_NAME}"
+ echo "Node rank: ${SKYPILOT_NODE_RANK} / ${SKYPILOT_NUM_NODES}"
+
+ # Service discovery via job group DNS
+ # The rollout head node provides load balancing across all SGLang instances
+ DATA_SERVER="data-server-0.${SKYPILOT_JOBGROUP_NAME}:8000"
+ ROLLOUT_SERVER="rollout-server-0.${SKYPILOT_JOBGROUP_NAME}:30000"
+ REWARD_SERVER="reward-server-0.${SKYPILOT_JOBGROUP_NAME}:8002"
+ REPLAY_BUFFER="replay-buffer-0.${SKYPILOT_JOBGROUP_NAME}:8003"
+
+ echo "Data server: ${DATA_SERVER}"
+ echo "Rollout server (load balanced): ${ROLLOUT_SERVER}"
+ echo "Reward server: ${REWARD_SERVER}"
+ echo "Replay buffer: ${REPLAY_BUFFER}"
+
+ # Wait for services to be ready
+ echo "Waiting for services to be available..."
+ sleep 30
+
+ # Only run training on rank 0 (coordinator)
+ if [ "${SKYPILOT_NODE_RANK}" == "0" ]; then
+ echo "Starting training on coordinator node..."
+ cd /code
+ python trainer.py \
+ --data-server ${DATA_SERVER} \
+ --rollout-server ${ROLLOUT_SERVER} \
+ --reward-server ${REWARD_SERVER} \
+ --replay-buffer ${REPLAY_BUFFER} \
+ --model ${MODEL_NAME} \
+ --batch-size ${BATCH_SIZE} \
+ --num-epochs ${NUM_EPOCHS}
+ else
+ echo "Worker node ${SKYPILOT_NODE_RANK} ready for distributed training"
+ # In a full implementation, worker nodes would join distributed training
+ # For this demo, they just wait
+ sleep infinity
+ fi
diff --git a/llm/train-eval-jobgroup/README.md b/llm/train-eval-jobgroup/README.md
new file mode 100644
index 00000000000..7d0cd9ffa16
--- /dev/null
+++ b/llm/train-eval-jobgroup/README.md
@@ -0,0 +1,172 @@
+# Parallel Training and Evaluation with Shared Volume
+
+This example demonstrates SkyPilot job groups with parallel training and evaluation tasks that share a Kubernetes PVC volume for checkpoints. The evaluator monitors the checkpoint directory and evaluates models "on the fly" as training produces them.
+
+## Architecture
+
+### Components
+
+1. **trainer**: Trains ResNet-18 on CIFAR-10, saves checkpoints every N epochs to shared storage
+2. **evaluator**: Watches the checkpoint directory, evaluates new checkpoints as they appear, reports test accuracy
+
+### Graceful Completion
+
+Both tasks complete naturally without forced termination:
+- When training finishes, the trainer writes a `training_complete` marker file to the shared volume
+- The evaluator detects this marker, finishes evaluating any remaining checkpoints, and exits gracefully (sketched below)
+- This pattern avoids the need for `primary_tasks` and `termination_delay` settings
+
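+A minimal sketch of the marker protocol, mirroring what `trainer.py` and `evaluator.py` in this example do (the helper names here are illustrative; only the marker file's existence matters, not its contents):
+
+```python
+import json
+import os
+import time
+
+MARKER = '/checkpoints/training_complete'
+
+# Trainer side: write the marker once, after the final epoch.
+def signal_complete(final_epoch: int, final_loss: float) -> None:
+    with open(MARKER, 'w') as f:
+        json.dump({'final_epoch': final_epoch,
+                   'final_loss': final_loss,
+                   'timestamp': time.time()}, f)
+
+# Evaluator side: keep polling until the marker exists and no work remains.
+def evaluation_loop(evaluate_new_checkpoints) -> None:
+    while True:
+        evaluate_new_checkpoints()
+        if os.path.exists(MARKER):
+            # Drain checkpoints written just before the marker, then exit.
+            evaluate_new_checkpoints()
+            break
+        time.sleep(5)
+```
+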
+## Usage
+
+### Create the Shared Volume
+
+First, create the shared volume that both tasks will use:
+
+```bash
+sky volume apply llm/train-eval-jobgroup/train-eval-ckpts-volume.yaml
+```
+
+### Launch the Job Group
+
+```bash
+sky jobs launch llm/train-eval-jobgroup/train-eval-jobgroup.yaml
+```
+
+### Monitor Training
+
+```bash
+# Check job status
+sky jobs queue
+
+# View trainer logs (training progress)
+sky jobs logs --task trainer
+
+# View evaluator logs (accuracy reports)
+sky jobs logs --task evaluator
+```
+
+### Expected Output
+
+**Trainer logs:**
+```
+Starting trainer...
+Loading CIFAR-10 dataset...
+Epoch 1/10 | Loss: 1.8234 | LR: 0.099511 | Time: 45.2s
+Epoch 2/10 | Loss: 1.2456 | LR: 0.095106 | Time: 44.8s
+Saved checkpoint: /checkpoints/checkpoint_epoch_2.pt
+...
+```
+
+**Evaluator logs:**
+```
+Starting evaluator...
+Watching for checkpoints...
+Epoch 2 | Train Loss: 1.2456 | Test Accuracy: 52.34%
+Epoch 4 | Train Loss: 0.8123 | Test Accuracy: 68.91%
+Epoch 6 | Train Loss: 0.5234 | Test Accuracy: 75.23%
+...
+```
+
+## Configuration
+
+### Environment Variables
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `NUM_EPOCHS` | `10` | Number of training epochs |
+| `SAVE_EVERY` | `2` | Save checkpoint every N epochs |
+| `CHECKPOINT_DIR` | `/checkpoints` | Shared checkpoint directory |
+
+### Customizing Resources
+
+Edit the YAML to adjust resources:
+
+```yaml
+resources:
+ accelerators: H100:1 # or A100:1 for faster training
+ memory: 32+
+```
+
+## How It Works
+
+### Shared Volume
+
+Both tasks mount the same SkyPilot volume at `/checkpoints`:
+
+```yaml
+volumes:
+ /checkpoints: train-eval-ckpts
+```
+
+This mounts a shared Kubernetes PVC that both tasks can access. The volume itself must be created with `sky volume apply` before the job group is launched.
+
+### Checkpoint Format
+
+The trainer saves checkpoints with:
+- Model state dict
+- Optimizer state dict
+- Epoch number
+- Training loss
+- Timestamp
+
+Files are named `checkpoint_epoch_N.pt` and a `latest.json` file tracks the most recent checkpoint.
+
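+For instance, resuming from the newest checkpoint looks like this (a sketch; it assumes `trainer.py` is importable from the same `code/` directory so the model architecture matches):
+
+```python
+import json
+import os
+
+import torch
+
+from trainer import get_model  # same code/ directory as this example
+
+ckpt_dir = '/checkpoints'
+with open(os.path.join(ckpt_dir, 'latest.json')) as f:
+    latest = json.load(f)  # keys: checkpoint, epoch, train_loss, timestamp
+
+model = get_model()
+ckpt = torch.load(os.path.join(ckpt_dir, latest['checkpoint']),
+                  map_location='cpu', weights_only=False)
+model.load_state_dict(ckpt['model_state_dict'])
+print(f"Resumed from epoch {ckpt['epoch']} (loss {ckpt['train_loss']:.4f})")
+```
+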
+### Evaluator Polling
+
+The evaluator uses simple filesystem polling to detect new checkpoints:
+1. Scans for `checkpoint_epoch_*.pt` files every 5 seconds
+2. Loads new checkpoints and evaluates on CIFAR-10 test set
+3. Reports accuracy and tracks results
+4. Exits when training completes
+
+## Key Features Demonstrated
+
+1. **Parallel Execution**: Training and evaluation run simultaneously
+2. **Shared Storage**: Tasks communicate through a shared filesystem
+3. **On-the-fly Evaluation**: No need to wait for training to finish
+4. **Simple Communication**: Filesystem-based, no network services needed
+
+## Comparison with RLHF Example
+
+| Feature | Train-Eval | RLHF |
+|---------|------------|------|
+| Communication | Shared filesystem | HTTP APIs |
+| Complexity | Simple | Complex |
+| Components | 2 tasks | 5 tasks |
+| Use case | Checkpointing | Service mesh |
+
+This example is intentionally simpler to demonstrate job groups without the complexity of network services.
+
+## Extending This Example
+
+### Adding More Evaluators
+
+You can run multiple evaluators for different metrics:
+
+```yaml
+---
+name: evaluator-accuracy
+# ... evaluates accuracy
+
+---
+name: evaluator-perplexity
+# ... evaluates perplexity
+```
+
+### Distributed Training
+
+Add `num_nodes` for multi-node training:
+
+```yaml
+name: trainer
+num_nodes: 2
+# ... use torch.distributed
+```
+
+### Early Stopping
+
+The evaluator could signal the trainer to stop early by writing a `stop.txt` file that the trainer checks.
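+
+A minimal sketch of that signal (`stop.txt` is a suggested convention, not something the current scripts implement):
+
+```python
+import os
+
+STOP_FILE = '/checkpoints/stop.txt'
+
+# Evaluator side: request a stop, e.g. when accuracy plateaus.
+def request_stop(reason: str) -> None:
+    with open(STOP_FILE, 'w') as f:
+        f.write(reason)
+
+# Trainer side: check at the top of each epoch and break out early.
+def should_stop() -> bool:
+    return os.path.exists(STOP_FILE)
+```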
diff --git a/llm/train-eval-jobgroup/code/evaluator.py b/llm/train-eval-jobgroup/code/evaluator.py
new file mode 100644
index 00000000000..732d8cb16da
--- /dev/null
+++ b/llm/train-eval-jobgroup/code/evaluator.py
@@ -0,0 +1,219 @@
+#!/usr/bin/env python3
+"""Evaluator script that watches for new checkpoints and evaluates them.
+
+This script monitors a checkpoint directory and evaluates new checkpoints
+as they appear, reporting accuracy on the CIFAR-10 test set.
+
+Usage:
+ python evaluator.py --checkpoint-dir /checkpoints
+"""
+
+import argparse
+import glob
+import os
+import time
+
+import torch
+import torch.nn as nn
+from torch.utils.data import DataLoader
+import torchvision
+import torchvision.transforms as transforms
+
+
+def get_model():
+ """Create a ResNet-18 model for CIFAR-10."""
+ model = torchvision.models.resnet18(weights=None)
+ # Modify for CIFAR-10 (32x32 images, 10 classes)
+ model.conv1 = nn.Conv2d(3,
+ 64,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ bias=False)
+ model.maxpool = nn.Identity()
+ model.fc = nn.Linear(model.fc.in_features, 10)
+ return model
+
+
+def get_test_dataloader(batch_size=128):
+ """Create test dataloader for CIFAR-10."""
+ transform_test = transforms.Compose([
+ transforms.ToTensor(),
+ transforms.Normalize((0.4914, 0.4822, 0.4465),
+ (0.2023, 0.1994, 0.2010)),
+ ])
+
+ testset = torchvision.datasets.CIFAR10(root='./data',
+ train=False,
+ download=True,
+ transform=transform_test)
+ testloader = DataLoader(testset,
+ batch_size=batch_size,
+ shuffle=False,
+ num_workers=2)
+
+ return testloader
+
+
+def evaluate(model, testloader, device):
+ """Evaluate model on test set and return accuracy."""
+ model.eval()
+ correct = 0
+ total = 0
+
+ with torch.no_grad():
+ for inputs, targets in testloader:
+ inputs, targets = inputs.to(device), targets.to(device)
+ outputs = model(inputs)
+ _, predicted = outputs.max(1)
+ total += targets.size(0)
+ correct += predicted.eq(targets).sum().item()
+
+ accuracy = 100.0 * correct / total
+ return accuracy
+
+
+def get_checkpoint_files(checkpoint_dir):
+ """Get list of checkpoint files in directory."""
+ pattern = os.path.join(checkpoint_dir, 'checkpoint_epoch_*.pt')
+ return set(glob.glob(pattern))
+
+
+def load_checkpoint(checkpoint_path, model, device):
+ """Load checkpoint and return metadata."""
+ checkpoint = torch.load(checkpoint_path,
+ map_location=device,
+ weights_only=False)
+ model.load_state_dict(checkpoint['model_state_dict'])
+ return {
+ 'epoch': checkpoint['epoch'],
+ 'train_loss': checkpoint['train_loss'],
+ 'timestamp': checkpoint.get('timestamp', 0),
+ }
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ description='Evaluate checkpoints as they appear')
+ parser.add_argument('--checkpoint-dir',
+ type=str,
+ required=True,
+ help='Directory to watch for checkpoints')
+ parser.add_argument('--poll-interval',
+ type=int,
+ default=5,
+ help='Seconds between polling for new checkpoints')
+ parser.add_argument('--batch-size',
+ type=int,
+ default=128,
+ help='Evaluation batch size')
+ args = parser.parse_args()
+
+ # Setup device
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ print(f"Using device: {device}")
+
+ # Create model
+ model = get_model().to(device)
+
+ # Get test data
+ print("Loading CIFAR-10 test dataset...")
+ testloader = get_test_dataloader(args.batch_size)
+ print(f"Test samples: {len(testloader.dataset)}")
+
+ print("\n" + "=" * 60)
+ print("Checkpoint Evaluator")
+ print("=" * 60)
+ print(f"Watching directory: {args.checkpoint_dir}")
+ print(f"Poll interval: {args.poll_interval} seconds")
+ print("=" * 60 + "\n")
+
+ # Track evaluated checkpoints
+ evaluated_checkpoints = set()
+ results = []
+
+ print("Waiting for checkpoints...")
+ print("-" * 60)
+
+ training_complete = False
+ complete_marker_path = os.path.join(args.checkpoint_dir,
+ 'training_complete')
+
+ while True:
+ # Get current checkpoint files
+ current_checkpoints = get_checkpoint_files(args.checkpoint_dir)
+
+ # Find new checkpoints
+ new_checkpoints = current_checkpoints - evaluated_checkpoints
+
+ if new_checkpoints:
+ # Sort by epoch number
+ sorted_checkpoints = sorted(
+ new_checkpoints,
+ key=lambda x: int(
+ os.path.basename(x).split('_')[-1].replace('.pt', '')))
+
+ for checkpoint_path in sorted_checkpoints:
+ try:
+ # Load and evaluate
+ metadata = load_checkpoint(checkpoint_path, model, device)
+ accuracy = evaluate(model, testloader, device)
+
+ result = {
+ 'checkpoint': os.path.basename(checkpoint_path),
+ 'epoch': metadata['epoch'],
+ 'train_loss': metadata['train_loss'],
+ 'test_accuracy': accuracy,
+ }
+ results.append(result)
+
+ print(f"Epoch {metadata['epoch']:3d} | "
+ f"Train Loss: {metadata['train_loss']:.4f} | "
+ f"Test Accuracy: {accuracy:.2f}%")
+
+ evaluated_checkpoints.add(checkpoint_path)
+
+ except Exception as e:
+ print(f"Error evaluating {checkpoint_path}: {e}")
+ # Don't mark as evaluated, will retry next poll
+ continue
+
+ # Check if training is complete (look for training_complete marker)
+ if os.path.exists(complete_marker_path):
+ if not training_complete:
+ print("\nDetected training completion marker.")
+ training_complete = True
+
+ # Evaluate any remaining checkpoints
+ remaining = get_checkpoint_files(
+ args.checkpoint_dir) - evaluated_checkpoints
+ if not remaining:
+ print("All checkpoints evaluated. Exiting.")
+ break
+
+ time.sleep(args.poll_interval)
+
+ # Final summary
+ print("\n" + "=" * 60)
+ print("Evaluation Complete!")
+ print("=" * 60)
+
+ if results:
+ print("\nResults Summary:")
+ print("-" * 60)
+ print(f"{'Epoch':>6} | {'Train Loss':>12} | {'Test Accuracy':>14}")
+ print("-" * 60)
+ for r in results:
+ print(f"{r['epoch']:>6} | {r['train_loss']:>12.4f} | "
+ f"{r['test_accuracy']:>13.2f}%")
+ print("-" * 60)
+
+ best = max(results, key=lambda x: x['test_accuracy'])
+ print(f"\nBest: Epoch {best['epoch']} with "
+ f"{best['test_accuracy']:.2f}% accuracy")
+
+ print("=" * 60)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/llm/train-eval-jobgroup/code/trainer.py b/llm/train-eval-jobgroup/code/trainer.py
new file mode 100644
index 00000000000..77381e18051
--- /dev/null
+++ b/llm/train-eval-jobgroup/code/trainer.py
@@ -0,0 +1,216 @@
+#!/usr/bin/env python3
+"""Trainer script for ResNet-18 on CIFAR-10.
+
+This script trains a ResNet-18 model on CIFAR-10 and saves checkpoints
+periodically to a shared directory that the evaluator can access.
+
+Usage:
+ python trainer.py --checkpoint-dir /checkpoints --num-epochs 10 --save-every 2
+"""
+
+import argparse
+import json
+import os
+import time
+
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import DataLoader
+import torchvision
+import torchvision.transforms as transforms
+
+
+def get_model():
+ """Create a ResNet-18 model for CIFAR-10."""
+ model = torchvision.models.resnet18(weights=None)
+ # Modify for CIFAR-10 (32x32 images, 10 classes)
+ model.conv1 = nn.Conv2d(3,
+ 64,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ bias=False)
+ model.maxpool = nn.Identity()
+ model.fc = nn.Linear(model.fc.in_features, 10)
+ return model
+
+
+def get_dataloaders(batch_size=128):
+ """Create training and test dataloaders for CIFAR-10."""
+ transform_train = transforms.Compose([
+ transforms.RandomCrop(32, padding=4),
+ transforms.RandomHorizontalFlip(),
+ transforms.ToTensor(),
+ transforms.Normalize((0.4914, 0.4822, 0.4465),
+ (0.2023, 0.1994, 0.2010)),
+ ])
+
+ transform_test = transforms.Compose([
+ transforms.ToTensor(),
+ transforms.Normalize((0.4914, 0.4822, 0.4465),
+ (0.2023, 0.1994, 0.2010)),
+ ])
+
+ trainset = torchvision.datasets.CIFAR10(root='./data',
+ train=True,
+ download=True,
+ transform=transform_train)
+ trainloader = DataLoader(trainset,
+ batch_size=batch_size,
+ shuffle=True,
+ num_workers=2)
+
+ testset = torchvision.datasets.CIFAR10(root='./data',
+ train=False,
+ download=True,
+ transform=transform_test)
+ testloader = DataLoader(testset,
+ batch_size=batch_size,
+ shuffle=False,
+ num_workers=2)
+
+ return trainloader, testloader
+
+
+def save_checkpoint(model, optimizer, epoch, train_loss, checkpoint_dir):
+ """Save a training checkpoint."""
+ os.makedirs(checkpoint_dir, exist_ok=True)
+
+ checkpoint_path = os.path.join(checkpoint_dir,
+ f'checkpoint_epoch_{epoch}.pt')
+ checkpoint = {
+ 'epoch': epoch,
+ 'model_state_dict': model.state_dict(),
+ 'optimizer_state_dict': optimizer.state_dict(),
+ 'train_loss': train_loss,
+ 'timestamp': time.time(),
+ }
+ torch.save(checkpoint, checkpoint_path)
+ print(f"Saved checkpoint: {checkpoint_path}")
+
+ # Update latest.json to point to this checkpoint
+ latest_path = os.path.join(checkpoint_dir, 'latest.json')
+ with open(latest_path, 'w') as f:
+ json.dump(
+ {
+ 'checkpoint': f'checkpoint_epoch_{epoch}.pt',
+ 'epoch': epoch,
+ 'train_loss': train_loss,
+ 'timestamp': time.time(),
+ },
+ f,
+ indent=2)
+
+
+def train_epoch(model, trainloader, criterion, optimizer, device):
+ """Train for one epoch and return average loss."""
+ model.train()
+ running_loss = 0.0
+ total_batches = 0
+
+ for inputs, targets in trainloader:
+ inputs, targets = inputs.to(device), targets.to(device)
+
+ optimizer.zero_grad()
+ outputs = model(inputs)
+ loss = criterion(outputs, targets)
+ loss.backward()
+ optimizer.step()
+
+ running_loss += loss.item()
+ total_batches += 1
+
+ return running_loss / total_batches
+
+
+def main():
+ parser = argparse.ArgumentParser(description='Train ResNet-18 on CIFAR-10')
+ parser.add_argument('--checkpoint-dir',
+ type=str,
+ required=True,
+ help='Directory to save checkpoints')
+ parser.add_argument('--num-epochs',
+ type=int,
+ default=10,
+ help='Number of training epochs')
+ parser.add_argument('--save-every',
+ type=int,
+ default=2,
+ help='Save checkpoint every N epochs')
+ parser.add_argument('--batch-size',
+ type=int,
+ default=128,
+ help='Training batch size')
+ parser.add_argument('--learning-rate',
+ type=float,
+ default=0.1,
+ help='Initial learning rate')
+ args = parser.parse_args()
+
+ # Setup device
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ print(f"Using device: {device}")
+
+ # Create model, criterion, optimizer
+ model = get_model().to(device)
+ criterion = nn.CrossEntropyLoss()
+ optimizer = optim.SGD(model.parameters(),
+ lr=args.learning_rate,
+ momentum=0.9,
+ weight_decay=5e-4)
+ scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
+ T_max=args.num_epochs)
+
+ # Get data
+ print("Loading CIFAR-10 dataset...")
+ trainloader, _ = get_dataloaders(args.batch_size)
+ print(f"Training samples: {len(trainloader.dataset)}")
+
+ # Training loop
+ print("\n" + "=" * 60)
+ print("Starting Training")
+ print("=" * 60)
+ print(f"Epochs: {args.num_epochs}")
+ print(f"Batch size: {args.batch_size}")
+ print(f"Checkpoint directory: {args.checkpoint_dir}")
+ print(f"Saving every {args.save_every} epochs")
+ print("=" * 60 + "\n")
+
+ for epoch in range(1, args.num_epochs + 1):
+ start_time = time.time()
+ train_loss = train_epoch(model, trainloader, criterion, optimizer,
+ device)
+ scheduler.step()
+ epoch_time = time.time() - start_time
+
+ print(f"Epoch {epoch}/{args.num_epochs} | "
+ f"Loss: {train_loss:.4f} | "
+ f"LR: {scheduler.get_last_lr()[0]:.6f} | "
+ f"Time: {epoch_time:.1f}s")
+
+ # Save checkpoint
+ if epoch % args.save_every == 0 or epoch == args.num_epochs:
+ save_checkpoint(model, optimizer, epoch, train_loss,
+ args.checkpoint_dir)
+
+ # Write training complete marker for evaluator
+ complete_marker = os.path.join(args.checkpoint_dir, 'training_complete')
+ with open(complete_marker, 'w') as f:
+ json.dump(
+ {
+ 'final_epoch': args.num_epochs,
+ 'final_loss': train_loss,
+ 'timestamp': time.time(),
+ },
+ f,
+ indent=2)
+ print(f"Wrote completion marker: {complete_marker}")
+
+ print("\n" + "=" * 60)
+ print("Training Complete!")
+ print("=" * 60)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/llm/train-eval-jobgroup/train-eval-ckpts-volume.yaml b/llm/train-eval-jobgroup/train-eval-ckpts-volume.yaml
new file mode 100644
index 00000000000..c72fc577fdc
--- /dev/null
+++ b/llm/train-eval-jobgroup/train-eval-ckpts-volume.yaml
@@ -0,0 +1,16 @@
+# Volume definition for train-eval-jobgroup shared checkpoint storage
+#
+# This volume is used by both the trainer and evaluator tasks to share
+# checkpoint files. Create this volume before launching the job group:
+#
+# sky volume apply llm/train-eval-jobgroup/train-eval-ckpts-volume.yaml
+#
+# Then launch the job group:
+# sky jobs launch llm/train-eval-jobgroup/train-eval-jobgroup.yaml
+
+name: train-eval-ckpts
+type: k8s-pvc
+size: 10Gi
+infra: kubernetes
+config:
+ access_mode: ReadWriteMany
diff --git a/llm/train-eval-jobgroup/train-eval-jobgroup.yaml b/llm/train-eval-jobgroup/train-eval-jobgroup.yaml
new file mode 100644
index 00000000000..861f1ed1459
--- /dev/null
+++ b/llm/train-eval-jobgroup/train-eval-jobgroup.yaml
@@ -0,0 +1,90 @@
+# Parallel Training and Evaluation with Shared Volume
+#
+# This example demonstrates a job group with parallel training and evaluation
+# tasks that share a Kubernetes volume for checkpoints. The evaluator monitors
+# the checkpoint directory and evaluates models as training produces them.
+#
+# Architecture:
+# - trainer: Trains ResNet-18 on CIFAR-10, saves checkpoints to shared volume
+# - evaluator: Watches for checkpoints, evaluates and reports accuracy
+#
+# Completion Behavior:
+# When training completes, the trainer writes a "training_complete" marker
+# file to the shared volume. The evaluator detects this marker, finishes
+# evaluating any remaining checkpoints, and exits gracefully. Both tasks
+# complete naturally without forced termination.
+#
+# Usage:
+# # First, create the shared volume:
+# sky volume apply llm/train-eval-jobgroup/train-eval-ckpts-volume.yaml
+#
+# # Then launch the job group:
+# sky jobs launch llm/train-eval-jobgroup/train-eval-jobgroup.yaml
+#
+# The components share storage via a Kubernetes PVC:
+# /checkpoints - Shared volume for checkpoint files
+---
+name: train-eval
+execution: parallel
+
+---
+# Trainer: Trains ResNet-18 on CIFAR-10 and saves checkpoints
+name: trainer
+resources:
+ accelerators: H100:1
+ memory: 16+
+ infra: kubernetes
+
+file_mounts:
+ /code: llm/train-eval-jobgroup/code
+
+volumes:
+ /checkpoints: train-eval-ckpts
+
+envs:
+ CHECKPOINT_DIR: /checkpoints
+ NUM_EPOCHS: 10
+ SAVE_EVERY: 2
+
+setup: |
+ uv pip install torch torchvision --system
+
+run: |
+ echo "Starting trainer..."
+ echo "JobGroup: ${SKYPILOT_JOBGROUP_NAME}"
+ echo "Checkpoints will be saved to ${CHECKPOINT_DIR}"
+
+ cd /code
+ python trainer.py \
+ --checkpoint-dir ${CHECKPOINT_DIR} \
+ --num-epochs ${NUM_EPOCHS} \
+ --save-every ${SAVE_EVERY}
+
+---
+# Evaluator: Watches for checkpoints and evaluates them
+name: evaluator
+resources:
+ accelerators: H100:1
+ memory: 16+
+ infra: kubernetes
+
+file_mounts:
+ /code: llm/train-eval-jobgroup/code
+
+volumes:
+ /checkpoints: train-eval-ckpts
+
+envs:
+ CHECKPOINT_DIR: /checkpoints
+
+setup: |
+ uv pip install torch torchvision --system
+
+run: |
+ echo "Starting evaluator..."
+ echo "JobGroup: ${SKYPILOT_JOBGROUP_NAME}"
+ echo "Watching for checkpoints in ${CHECKPOINT_DIR}"
+
+ cd /code
+ python evaluator.py \
+ --checkpoint-dir ${CHECKPOINT_DIR}
diff --git a/llm/verl/README.md b/llm/verl/README.md
index 0d2ec660b97..e227368b038 100644
--- a/llm/verl/README.md
+++ b/llm/verl/README.md
@@ -3,6 +3,8 @@
[Verl](https://github.com/volcengine/verl) is the most popular open-source reinforcement learning framework for LLMs, supporting PPO, GRPO, and other algorithms.
+Also see [`search-tooling/`](https://github.com/skypilot-org/skypilot/tree/master/llm/verl/search-tooling) and this [blog](https://blog.skypilot.co/verl-tool-calling/) for tool-augmented “search” workflows (Search-R1 style), including Google Search–backed inference and a Wikipedia FAISS retrieval service used for inference and training.
+
## Why SkyPilot + Verl?
SkyPilot makes RL training **easy and cost-effective**:
@@ -47,81 +49,8 @@ sky status --endpoint 8280 verl
Ray dashboard showing real-time monitoring of distributed training across multiple nodes
-## Key Features
-
-The example trains Qwen2.5-0.5B-Instruct on the GSM8K dataset using PPO:
-- **Multi-node distributed training** with automatic Ray cluster setup
-- **Checkpoint persistence** to cloud storage for fault tolerance
-- **Customizable models and datasets** via environment variables
-
-## Optional: Enable W&B for Training Visualization
-
-To track training curves and metrics in Weights & Biases:
-```bash
-# 1. Set your W&B API key locally
-export WANDB_API_KEY=your-api-key
-
-# 2. Launch with the secret flag
-sky launch -c verl llm/verl/multinode.yaml --secret WANDB_API_KEY
-
-# 3. Edit multinode.yaml to enable W&B logger (see comments in the file)
-```
-
-## Advanced Usage
-
-### 💰 Use Spot Instances for 3x Cost Savings
-
-```bash
-sky jobs launch -n verl-job llm/verl/multinode.yaml
-```
-Training automatically resumes from checkpoints if preempted.
-
-### 🚀 Continue Experiments on the Same Cluster
-
-```bash
-# Run additional training epochs
-sky exec verl llm/verl/multinode.yaml --env TOTAL_EPOCHS=10
-
-# The YAML automatically detects and reuses the existing Ray cluster
-```
-
-### 📈 Scale to More Nodes
-
-```bash
-sky launch -c verl llm/verl/multinode.yaml --num-nodes 4
-```
-
-### 🔧 Customize Training Configuration
-
-Modify parameters directly:
-```bash
-sky launch -c verl llm/verl/multinode.yaml \
- --env MODEL_NAME=meta-llama/Llama-2-7b-hf \
- --env ACTOR_LR=5e-6 \
- --env CRITIC_LR=1e-5
-```
-
-Train a larger model:
-```bash
-sky launch -c verl llm/verl/multinode.yaml \
- --env MODEL_NAME=Qwen/Qwen2.5-7B-Instruct \
- --gpus A100-80GB:8 --num-nodes 4
-```
-
-## Understanding the Setup
-
-1. **Head node**: Prepares data, starts Ray head, submits training job
-2. **Worker nodes**: Join Ray cluster for distributed training
-3. **Smart resumption**: Ray cluster is reused if already running, avoiding restart overhead
-
-## Troubleshooting
-
-- **OOM errors**: Reduce batch sizes or `gpu_memory_utilization`
-- **Connection issues**: Ensure ports 6385 (Ray) and 8280 (dashboard) are not blocked
-- **First run is slow**: Model download happens once, subsequent runs are faster
-
## Learn More
- [Verl Documentation](https://verl.readthedocs.io/)
- [Verl GitHub Repository](https://github.com/volcengine/verl)
-- [SkyPilot Ray Setup Guide](https://docs.skypilot.co/en/latest/running-jobs/distributed-jobs.html#executing-a-distributed-ray-program)
\ No newline at end of file
+- [SkyPilot Ray Setup Guide](https://docs.skypilot.co/en/latest/running-jobs/distributed-jobs.html#executing-a-distributed-ray-program)
diff --git a/llm/verl/search-tooling/README.md b/llm/verl/search-tooling/README.md
new file mode 100644
index 00000000000..41f8b322e0e
--- /dev/null
+++ b/llm/verl/search-tooling/README.md
@@ -0,0 +1,37 @@
+# Search tooling for VERL
+
+This folder contains SkyPilot YAMLs for training and inference with tool-augmented “search” workflows (Search-R1 style), using either:
+- a **Google Search** backend, or
+- a **Wikipedia retrieval service** (FAISS index).
+
+See this [blog](https://blog.skypilot.co/verl-tool-calling/) for how these YAMLs are used to train an RL agent that can use Google Search.
+
+## Inference (Google Search backend)
+
+```bash
+sky launch -c verl-infer-google llm/verl/search-tooling/verl-search-interaction-google-search.yaml \
+ --env MODEL_PATH=/checkpoints/hf_model \
+ --env GOOGLE_API_KEY=your_key_here \
+ --env GOOGLE_CSE_ID=your_cse_id_here \
+ -y
+```
+
+## Inference (local Wikipedia retrieval on the same node)
+
+```bash
+sky launch -c verl-infer llm/verl/search-tooling/verl-search-interaction-infer.yaml \
+ --env MODEL_PATH=/checkpoints/hf_model \
+ -y
+```
+
+## Retrieval service (CPU-only, for reuse across jobs)
+
+```bash
+sky serve up -n retrieval llm/verl/search-tooling/verl-search-interaction-retrieval.yaml --cpus 32+ --memory 256+ -y
+sky serve status retrieval --endpoint 8000
+```
+
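+Once the endpoint is up, retrieval is a plain JSON POST to `/retrieve`. A sketch in Python (the request payload mirrors the connectivity check in `verl-search-interaction-rl-trainer.yaml`; the exact response schema is whatever the VERL retrieval server returns):
+
+```python
+import requests
+
+# Replace with the endpoint printed by `sky serve status retrieval --endpoint 8000`.
+endpoint = 'http://RETRIEVAL_IP:8000'
+
+resp = requests.post(f'{endpoint}/retrieve',
+                     json={'queries': ['who wrote the iliad'],
+                           'topk': 3,
+                           'return_scores': False})
+resp.raise_for_status()
+print(resp.json())
+```
+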
+## Training
+
+- Single-node training with retrieval running on the same node: `llm/verl/search-tooling/verl-search-interaction.yaml`
+- Training that points to an external retrieval service: `llm/verl/search-tooling/verl-search-interaction-rl-trainer.yaml`
diff --git a/llm/verl/search-tooling/verl-search-interaction-google-search.yaml b/llm/verl/search-tooling/verl-search-interaction-google-search.yaml
new file mode 100644
index 00000000000..e51cc6db2eb
--- /dev/null
+++ b/llm/verl/search-tooling/verl-search-interaction-google-search.yaml
@@ -0,0 +1,140 @@
+# Search Tool Interaction Inference (Google Search backend)
+#
+# This example demonstrates inference using Search-R1 with a search/retrieval tool.
+# The model uses a Google Search–backed tool for answering questions that require external knowledge.
+# Both the Google search server and inference run on the same node.
+#
+# Usage:
+# sky launch -c verl-infer-google llm/verl/search-tooling/verl-search-interaction-google-search.yaml \
+# --env MODEL_PATH=/checkpoints/hf_model \
+# --env GOOGLE_API_KEY=your_key_here \
+# --env GOOGLE_CSE_ID=your_cse_id_here \
+# -y
+#
+# Requirements:
+# - Single GPU for inference
+# - Valid Google Programmable Search Engine (CSE) + API key
+
+resources:
+ accelerators: H100:1
+ memory: 128+
+ ports:
+ - 8000 # Google search server
+
+num_nodes: 1
+
+envs:
+ MODEL_PATH: "" # Optional: Path to model checkpoint (defaults to base model)
+ GOOGLE_API_KEY: "" # Required: Google API key
+ GOOGLE_CSE_ID: "" # Required: Google Custom Search Engine ID
+ CHECKPOINT_BUCKET_NAME: verl-search-interaction-checkpoints
+
+file_mounts:
+ /checkpoints:
+ name: ${CHECKPOINT_BUCKET_NAME}
+ mode: MOUNT
+
+setup: |
+ set -e
+
+ echo "=== Search Tool Inference Setup (Google Search) ==="
+
+ # System dependencies
+ echo "Installing system dependencies..."
+ sudo apt update && sudo apt install -y iproute2 git
+
+ # Python environment
+ echo "Setting up Python virtual environment..."
+ uv venv --python 3.10 --seed
+ source .venv/bin/activate
+
+ echo "Installing PyTorch..."
+ uv pip install "torch==2.8.*" torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
+
+ # Clone VERL repository (if infer.py relies on its code / configs)
+ echo "Cloning VERL repository..."
+ rm -rf verl
+ git clone https://github.com/volcengine/verl.git
+ cd verl
+ git checkout v0.6.0
+
+ echo "Installing VERL + SGLang dependencies..."
+ uv pip install -v -e .
+ uv pip install wheel
+ uv pip install packaging
+ uv pip install -r ./requirements_sglang.txt
+ uv pip install "https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.8cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"
+
+ cd ..
+
+ # Clone Search-R1 for inference
+ echo "Cloning Search-R1 repository..."
+ rm -rf Search-R1
+ git clone https://github.com/PeterGriffinJin/Search-R1.git
+
+ # Install additional inference dependencies
+ cd Search-R1
+ if [ -f requirements.txt ]; then
+ echo "Installing Search-R1 requirements..."
+ uv pip install -r requirements.txt
+ fi
+
+ # Ensure Google API client is available (if not already pulled in)
+ uv pip install google-api-python-client
+
+ cd ..
+
+ echo "✓ Inference setup complete!"
+
+run: |
+ set -e
+
+ echo "=== Search Tool Inference (Google Search backend) ==="
+
+ # Activate environment
+ source .venv/bin/activate
+
+ # Sanity check env vars
+ if [ -z "$GOOGLE_API_KEY" ] || [ -z "$GOOGLE_CSE_ID" ]; then
+ echo "ERROR: GOOGLE_API_KEY and GOOGLE_CSE_ID must be set via --env."
+ exit 1
+ fi
+
+ echo "Using GOOGLE_API_KEY: (set)"
+ echo "Using GOOGLE_CSE_ID: (set)"
+
+ # Start Google search server in background
+ cd ~/sky_workdir/Search-R1
+ echo "Starting Google search server on port 8000..."
+ python search_r1/search/google_search_server.py \
+ --api_key "$GOOGLE_API_KEY" \
+ --cse_id "$GOOGLE_CSE_ID" \
+ > google_search_server.log 2>&1 &
+
+ RETRIEVAL_PID=$!
+ echo "Google search server PID: $RETRIEVAL_PID"
+
+ # Give the server a moment to start
+ sleep 10
+
+ # (Optional) basic health check if the server exposes one
+ # curl -f http://127.0.0.1:8000/health || echo "Healthcheck failed (continuing anyway)"
+
+ # Run inference
+ echo "Running infer.py..."
+ if [ -n "$MODEL_PATH" ]; then
+ # If your infer.py supports a flag, use it; otherwise it may read MODEL_PATH from env.
+ python infer.py --model_path "$MODEL_PATH" || python infer.py
+ else
+ python infer.py
+ fi
+
+ echo "✓ Inference finished"
+
+ # Clean up search server (SkyPilot will tear down the node afterwards anyway)
+ if ps -p $RETRIEVAL_PID > /dev/null 2>&1; then
+ echo "Stopping Google search server..."
+ kill $RETRIEVAL_PID || true
+ fi
+
+ echo "=== Done ==="
diff --git a/llm/verl/search-tooling/verl-search-interaction-infer.yaml b/llm/verl/search-tooling/verl-search-interaction-infer.yaml
new file mode 100644
index 00000000000..cdc88566e60
--- /dev/null
+++ b/llm/verl/search-tooling/verl-search-interaction-infer.yaml
@@ -0,0 +1,122 @@
+# Search Tool Interaction Inference
+#
+# This example demonstrates inference using Search-R1 with a search/retrieval tool.
+# The model uses a search tool for answering questions that require external knowledge.
+# Both retrieval service and inference run on the same node.
+#
+# Usage:
+# sky launch -c verl-infer llm/verl/search-tooling/verl-search-interaction-infer.yaml --env MODEL_PATH=/checkpoints/hf_model -y
+#
+# Requirements:
+# - Single GPU for inference
+# - Sufficient memory for retrieval index
+
+resources:
+ accelerators: H100:1
+ memory: 128+
+ ports:
+ - 8000 # Retrieval service
+
+num_nodes: 1
+
+envs:
+ MODEL_PATH: "" # Optional: Path to model checkpoint (defaults to base model)
+ RETRIEVAL_TOPK: 3
+ RETRIEVER_NAME: e5
+ RETRIEVER_MODEL: intfloat/e5-base-v2
+ CHECKPOINT_BUCKET_NAME: verl-search-interaction-checkpoints
+
+file_mounts:
+ /checkpoints:
+ name: ${CHECKPOINT_BUCKET_NAME}
+ mode: MOUNT
+
+setup: |
+ set -e
+
+ echo "=== Search Tool Inference Setup ==="
+
+ # System dependencies
+ echo "Installing system dependencies..."
+ sudo apt update && sudo apt install -y iproute2
+
+ # Python environment
+ echo "Setting up Python virtual environment..."
+ uv venv --python 3.10 --seed
+ source .venv/bin/activate
+
+ # Clone VERL repository first: the editable install and
+ # requirements_sglang.txt below live inside it
+ echo "Cloning VERL repository..."
+ rm -rf verl
+ git clone https://github.com/volcengine/verl.git
+ cd verl
+ git checkout v0.6.0
+
+ # Install dependencies
+ echo "Installing PyTorch and dependencies..."
+ uv pip install "torch==2.8.*" torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
+ uv pip install -v -e .
+ uv pip install wheel
+ uv pip install packaging
+ uv pip install -r ./requirements_sglang.txt
+ uv pip install "https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.8cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"
+ cd ..
+
+ # Download Wikipedia corpus and FAISS index
+ echo "Downloading Wikipedia corpus and FAISS index..."
+ export save_path=~/dataset
+ mkdir -p $save_path
+
+ huggingface-cli download maknee/wiki-18-subsets wiki-18-100k.jsonl.gz --repo-type=dataset --local-dir $save_path
+ huggingface-cli download maknee/wiki-18-subsets e5_Flat-100k.index --repo-type=dataset --local-dir $save_path
+
+ # Move files to expected locations
+ mv $save_path/wiki-18-100k.jsonl.gz $save_path/wiki-18.jsonl.gz
+ mv $save_path/e5_Flat-100k.index $save_path/e5_Flat.index
+
+ # Decompress the JSONL file
+ gzip -d $save_path/wiki-18.jsonl.gz -f
+
+ # Clone Search-R1 for inference
+ echo "Cloning Search-R1 repository..."
+ rm -rf Search-R1
+ git clone https://github.com/PeterGriffinJin/Search-R1/
+
+ # Install additional inference dependencies if needed
+ cd Search-R1
+ if [ -f requirements.txt ]; then
+ uv pip install -r requirements.txt
+ fi
+ cd ..
+
+ echo "✓ Inference setup complete!"
+
+run: |
+ set -e
+
+ echo "=== Search Tool Inference ==="
+
+ # Activate environment
+ source .venv/bin/activate
+
+ # Set up paths
+ save_path=~/dataset
+ index_file=$save_path/e5_Flat.index
+ corpus_file=$save_path/wiki-18.jsonl
+
+ # Start retrieval server in background
+ echo "Starting retrieval server on port 8000..."
+ cd verl
+ python examples/sglang_multiturn/search_r1_like/local_dense_retriever/retrieval_server.py \
+ --index_path $index_file \
+ --corpus_path $corpus_file \
+ --topk $RETRIEVAL_TOPK \
+ --retriever_name $RETRIEVER_NAME \
+ --retriever_model $RETRIEVER_MODEL &
+
+ RETRIEVAL_PID=$!
+ sleep 10
+
+ # Run inference
+ cd ~/sky_workdir/Search-R1
+ python infer.py
diff --git a/llm/verl/search-tooling/verl-search-interaction-retrieval.yaml b/llm/verl/search-tooling/verl-search-interaction-retrieval.yaml
new file mode 100644
index 00000000000..dce01c9a268
--- /dev/null
+++ b/llm/verl/search-tooling/verl-search-interaction-retrieval.yaml
@@ -0,0 +1,112 @@
+# Search Tool Retrieval Service
+#
+# This service provides Wikipedia retrieval capabilities using FAISS indexing.
+# It runs on CPU nodes and exposes a retrieval API on port 8000.
+#
+# Usage:
+# sky launch -c retrieval llm/verl/search-tooling/verl-search-interaction-retrieval.yaml --cpus 32+ --memory 256+ -y
+#
+# Get endpoint:
+# sky status retrieval --endpoint 8000
+#
+# OR with sky serve
+# sky serve up -n retrieval llm/verl/search-tooling/verl-search-interaction-retrieval.yaml --cpus 32+ --memory 256+ -y
+#
+# Get endpoint:
+# sky serve status retrieval --endpoint 8000
+
+service:
+ readiness_probe: /
+ replicas: 3
+
+resources:
+ cpus: 32+
+ memory: 256+
+ use_spot: false
+ ports:
+ - 8000 # Retrieval service API
+
+num_nodes: 1
+
+envs:
+ RETRIEVAL_TOPK: 3
+ RETRIEVER_NAME: e5
+ RETRIEVER_MODEL: intfloat/e5-base-v2
+
+setup: |
+ set -e
+
+ echo "=== Retrieval Service Setup ==="
+
+ # System dependencies
+ echo "Installing system dependencies..."
+ sudo apt update && sudo apt install -y iproute2
+
+ # Python environment
+ echo "Setting up Python virtual environment..."
+ uv venv --python 3.10 --seed
+ source .venv/bin/activate
+
+ # Install retrieval service dependencies
+ echo "Installing retrieval service dependencies..."
+ uv pip install "torch==2.8.*" torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+ uv pip install transformers datasets huggingface_hub
+ uv pip install faiss-cpu
+ uv pip install uvicorn fastapi uvloop==0.21.0
+
+ # Download Wikipedia corpus and FAISS index
+ echo "Downloading Wikipedia corpus and FAISS index..."
+ export save_path=~/dataset
+ mkdir -p $save_path
+
+ huggingface-cli download maknee/wiki-18-subsets wiki-18-100k.jsonl.gz --repo-type=dataset --local-dir $save_path
+ huggingface-cli download maknee/wiki-18-subsets e5_Flat-100k.index --repo-type=dataset --local-dir $save_path
+
+ # Move files to expected locations
+ mv $save_path/wiki-18-100k.jsonl.gz $save_path/wiki-18.jsonl.gz
+ mv $save_path/e5_Flat-100k.index $save_path/e5_Flat.index
+
+ # Decompress the JSONL file
+ gzip -d $save_path/wiki-18.jsonl.gz -f
+
+ # Clone VERL repository for retrieval server code
+ echo "Cloning repositories..."
+ git clone https://github.com/volcengine/verl.git
+ cd verl
+ git checkout v0.6.0
+
+ # Patch retrieval server for CPU-only usage (comment out CUDA calls)
+ echo "Patching retrieval server for CPU-only usage..."
+ sed -i 's/^\(\s*\)\(model\.cuda()\)/\1# \2 # Commented out for CPU-only deployment/' \
+ examples/sglang_multiturn/search_r1_like/local_dense_retriever/retrieval_server.py
+ sed -i 's/^\(\s*\)\(inputs = {k: v\.cuda() for k, v in inputs\.items()}\)/\1# \2 # Commented out for CPU-only deployment/' \
+ examples/sglang_multiturn/search_r1_like/local_dense_retriever/retrieval_server.py
+
+ cd ..
+
+ echo "✓ Retrieval service setup complete!"
+
+run: |
+ set -e
+
+ echo "=== Starting Retrieval Service ==="
+
+ # Activate environment
+ source .venv/bin/activate
+
+ # Set up paths
+ save_path=~/dataset
+ index_file=$save_path/e5_Flat.index
+ corpus_file=$save_path/wiki-18.jsonl
+
+ # Start retrieval server
+ echo "Starting retrieval server on port 8000..."
+ cd verl
+ python examples/sglang_multiturn/search_r1_like/local_dense_retriever/retrieval_server.py \
+ --index_path $index_file \
+ --corpus_path $corpus_file \
+ --topk $RETRIEVAL_TOPK \
+ --retriever_name $RETRIEVER_NAME \
+ --retriever_model $RETRIEVER_MODEL &
+
+ echo "✓ Retrieval service running on port 8000"
diff --git a/llm/verl/search-tooling/verl-search-interaction-rl-trainer.yaml b/llm/verl/search-tooling/verl-search-interaction-rl-trainer.yaml
new file mode 100644
index 00000000000..574976b66e7
--- /dev/null
+++ b/llm/verl/search-tooling/verl-search-interaction-rl-trainer.yaml
@@ -0,0 +1,314 @@
+# Search Tool Interaction Training with VERL (RL Trainer)
+#
+# This example demonstrates multi-turn tool interaction training using VERL with a search/retrieval tool.
+# The model learns to use a search tool for answering questions that require external knowledge.
+#
+# Requires a separate retrieval service running (see verl-search-interaction-retrieval.yaml)
+#
+# Based on: https://verl.readthedocs.io/en/v0.5.x/sglang_multiturn/search_tool_example.html
+#
+# Usage:
+# # 1. Launch retrieval service first
+# sky launch -c retrieval llm/verl/search-tooling/verl-search-interaction-retrieval.yaml --cpus 32+ --memory 256+ -y
+#
+# # 2. Get retrieval service endpoint
+# RETRIEVAL_IP=$(sky status retrieval --endpoint 8000)
+#
+# # 3. Launch training (without WandB)
+# sky launch -c verl-train llm/verl/search-tooling/verl-search-interaction-rl-trainer.yaml --env RETRIEVAL_SERVICE_URL=http://$RETRIEVAL_IP --env DATASET_SIZE=small --env TOTAL_EPOCHS=1 -y
+#
+# # Or with WandB logging (optional)
+# sky launch -c verl-train llm/verl/search-tooling/verl-search-interaction-rl-trainer.yaml --env RETRIEVAL_SERVICE_URL=http://$RETRIEVAL_IP --env DATASET_SIZE=small --env TOTAL_EPOCHS=1 --secret WANDB_API_KEY -y
+#
+# Requirements:
+# - Docker with SYS_PTRACE capability (for PyTorch multiprocessing CUDA tensor sharing)
+# - H100 GPUs (can be adjusted for other accelerators)
+# - Running retrieval service at RETRIEVAL_SERVICE_URL
+
+resources:
+ accelerators: H100:1
+ memory: 128+
+ image_id: docker:verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2
+ ports:
+ - 8265 # Ray dashboard
+ - 9090 # Model serving
+
+num_nodes: 1
+
+config:
+ docker:
+ run_options:
+ - --cap-add=SYS_PTRACE # Required for PyTorch CUDA tensor sharing between Ray workers
+ - --ipc=host
+ - --shm-size=16g
+
+envs:
+ RETRIEVAL_SERVICE_URL: "" # Required: URL of the retrieval service (e.g., http://retrieval-ip:8000)
+ DATASET_SIZE: small # Options: small (1000 train, 200 test), medium (10k train, 2k test), full
+ TOTAL_EPOCHS: 1
+ TOTAL_STEPS: 10
+ TRAIN_BATCH_SIZE: 512
+ VAL_BATCH_SIZE: 256
+ SAVE_FREQ: 5 # Save checkpoints every 5 steps
+ TEST_FREQ: 5 # Test every 5 steps
+ MODEL_NAME: Qwen/Qwen2.5-3B-Instruct
+ WANDB_PROJECT_NAME: search_r1_like_async_rl
+ WANDB_EXPERIMENT_NAME: qwen2.5-3b-it_rm-searchR1-like-sgl-multiturn
+ CHECKPOINT_BUCKET_NAME: nebius://verl-search-interaction-checkpoints
+
+file_mounts:
+ /checkpoints:
+ source: ${CHECKPOINT_BUCKET_NAME}
+ mode: MOUNT_CACHED
+
+secrets:
+ WANDB_API_KEY: "" # Optional: Set to enable WandB logging. If not set, only console logging will be used.
+
+setup: |
+ rm -f ~/.pip/pip.conf
+ rm -f ~/.config/pip/pip.conf
+
+ set -e
+
+ echo "=== VERL Search Tool Interaction Training Setup ==="
+
+ # Validate required environment variables
+ if [ -z "$RETRIEVAL_SERVICE_URL" ]; then
+ echo "ERROR: RETRIEVAL_SERVICE_URL environment variable is required"
+ echo "Example: --env RETRIEVAL_SERVICE_URL=http://retrieval-ip:8000"
+ exit 1
+ fi
+
+ # Python environment
+ echo "Setting up Python virtual environment..."
+ uv venv --python 3.10 --seed
+ source .venv/bin/activate
+
+ # Clone VERL repository
+ echo "Cloning VERL repository..."
+ rm -rf verl
+ git clone https://github.com/volcengine/verl.git
+ cd verl
+ git checkout v0.6.0
+
+ # Core dependencies
+ echo "Installing PyTorch and VERL..."
+ uv pip install "torch==2.8.*" torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
+ uv pip install -v -e .
+ uv pip install "https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.8cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"
+ uv pip install wheel
+ uv pip install packaging
+ uv pip install -r ./requirements_sglang.txt
+
+ # Install uvloop (required version)
+ uv pip install uvloop==0.21.0
+
+ # Data preparation
+ echo "Preparing search R1 dataset..."
+ python3 examples/data_preprocess/preprocess_search_r1_dataset.py
+
+ # Clone Search-R1 for additional utilities
+ git clone https://github.com/PeterGriffinJin/Search-R1/
+
+ # Update tool config to use external retrieval service
+ echo "Configuring external retrieval service..."
+ TOOL_CONFIG="examples/sglang_multiturn/config/tool_config/search_tool_config.yaml"
+
+ # Backup original config
+ cp $TOOL_CONFIG ${TOOL_CONFIG}.bak
+
+ # Update retrieval URL and num_workers in the config
+ sed -i 's/num_workers: *120/num_workers: 8/' $TOOL_CONFIG
+ sed -i "s|http://127\.0\.0\.1:8000/retrieve|$RETRIEVAL_SERVICE_URL/retrieve|g" $TOOL_CONFIG
+ sed -i "s|http://localhost:8000|$RETRIEVAL_SERVICE_URL|g" $TOOL_CONFIG
+
+ echo "✓ Setup complete!"
+ echo "Dataset location: ~/data/searchR1_processed_direct/"
+ echo "VERL repository: $(pwd)"
+ echo "Retrieval service: $RETRIEVAL_SERVICE_URL"
+
+run: |
+ set -e
+
+ echo "=== VERL Search Tool Interaction Training ==="
+ sudo apt update && sudo apt install -y iproute2
+
+ # Validate retrieval service
+ if [ -z "$RETRIEVAL_SERVICE_URL" ]; then
+ echo "ERROR: RETRIEVAL_SERVICE_URL environment variable is required"
+ exit 1
+ fi
+
+ echo "Testing connection to retrieval service at $RETRIEVAL_SERVICE_URL..."
+ # Give it a few retries in case the service is still starting
+ max_retries=30
+ retry_count=0
+ while [ $retry_count -lt $max_retries ]; do
+ # Test the /retrieve endpoint with a sample query
+ # '|| true' keeps 'set -e' from aborting while the service is still starting
+ test_response=$(curl -s -X POST "${RETRIEVAL_SERVICE_URL}/retrieve" \
+ -H "Content-Type: application/json" \
+ -d '{"queries": ["test query"], "topk": 1, "return_scores": false}' \
+ -w "\n%{http_code}" 2>&1) || true
+
+ http_code=$(echo "$test_response" | tail -n1)
+
+ if [ "$http_code" = "200" ]; then
+ echo "✓ Successfully connected to retrieval service"
+ echo "✓ /retrieve endpoint is responding correctly"
+ break
+ fi
+ retry_count=$((retry_count+1))
+ if [ $retry_count -eq $max_retries ]; then
+ echo "WARNING: Could not connect to retrieval service at $RETRIEVAL_SERVICE_URL"
+ echo "Make sure the retrieval service is running and accessible"
+ echo "Last response code: $http_code"
+ fi
+ sleep 5
+ done
+
+ # Multi-node setup
+ HEAD_IP=$(echo "$SKYPILOT_NODE_IPS" | head -n1)
+ NUM_NODES=$SKYPILOT_NUM_NODES
+ NUM_GPUS_PER_NODE=$SKYPILOT_NUM_GPUS_PER_NODE
+
+ # Network configuration for distributed training
+ NETWORK_INTERFACE=$(ip route get 8.8.8.8 | grep -oP 'dev \K\S+')
+ export GLOO_SOCKET_IFNAME=$NETWORK_INTERFACE
+ export NCCL_SOCKET_IFNAME=$NETWORK_INTERFACE
+
+ # PyTorch multiprocessing configuration
+ export TORCH_MULTIPROCESSING_SHARING_STRATEGY=file_system
+
+ # Activate environment
+ source .venv/bin/activate
+
+ # Set up paths
+ cd verl
+ PROJECT_DIR="$(pwd)"
+ export PYTHONPATH="$PROJECT_DIR:$PYTHONPATH"
+
+ # WandB login (optional)
+ if [ -n "$WANDB_API_KEY" ]; then
+ echo "Logging into Weights & Biases..."
+ python3 -c "import wandb; wandb.login(relogin=True, key='$WANDB_API_KEY')"
+ fi
+
+ if [ "$SKYPILOT_NODE_RANK" == "0" ]; then
+ echo "Starting Ray head node on port 6379..."
+ ps aux | grep ray | grep 6379 &> /dev/null || ray start --head --disable-usage-stats --port=6379 --dashboard-host=0.0.0.0 --dashboard-port=8265
+
+ # Wait for all nodes to connect
+ echo "Waiting for $NUM_NODES nodes to connect..."
+ retry_count=0
+ max_retries=30
+ while [ $retry_count -lt $max_retries ]; do
+ # grep -c already prints 0 on no match; '|| true' avoids both a 'set -e'
+ # abort and the duplicate "0" that '|| echo 0' would append
+ connected_nodes=$(ray status 2>/dev/null | grep -c "node_" || true)
+ if [ "$connected_nodes" -ge "$NUM_NODES" ]; then
+ echo "✓ All $NUM_NODES nodes connected"
+ break
+ fi
+ retry_count=$((retry_count+1))
+ sleep 10
+ done
+
+ # Display Ray cluster status
+ echo "Ray cluster status:"
+ ray status
+
+ echo "Starting search tool interaction training..."
+ cd $PROJECT_DIR
+
+ # Increase file descriptor limit
+ ulimit -n 65535
+
+ # Set up configuration paths
+ CONFIG_PATH="$PROJECT_DIR/examples/sglang_multiturn/config"
+ TRAIN_DATA="$HOME/data/searchR1_processed_direct/train.parquet"
+ VAL_DATA="$HOME/data/searchR1_processed_direct/test.parquet"
+ TOOL_CONFIG="$CONFIG_PATH/tool_config/search_tool_config.yaml"
+
+ # Configure logging based on WANDB_API_KEY availability
+ if [ -n "$WANDB_API_KEY" ]; then
+ LOGGER_CONFIG='["console","wandb"]'
+ WANDB_ARGS="trainer.project_name=$WANDB_PROJECT_NAME trainer.experiment_name=$WANDB_EXPERIMENT_NAME"
+ echo "✓ WandB logging enabled"
+ else
+ LOGGER_CONFIG='["console"]'
+ WANDB_ARGS=""
+ echo "ℹ WandB logging disabled (no API key provided)"
+ fi
+
+ # Training with search tool
+ python3 -m verl.trainer.main_ppo \
+ --config-path="$CONFIG_PATH" \
+ --config-name='search_multiturn_grpo' \
+ algorithm.adv_estimator=grpo \
+ data.train_batch_size=$TRAIN_BATCH_SIZE \
+ data.val_batch_size=$VAL_BATCH_SIZE \
+ data.max_prompt_length=4096 \
+ data.max_response_length=3000 \
+ data.filter_overlong_prompts=True \
+ data.truncation='error' \
+ data.return_raw_chat=True \
+ actor_rollout_ref.model.path=$MODEL_NAME \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.285 \
+ actor_rollout_ref.model.use_remove_padding=True \
+ actor_rollout_ref.actor.ppo_mini_batch_size=16 \
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
+ actor_rollout_ref.actor.use_kl_loss=True \
+ actor_rollout_ref.actor.kl_loss_coef=0.001 \
+ actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+ actor_rollout_ref.actor.entropy_coeff=0 \
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
+ actor_rollout_ref.actor.fsdp_config.param_offload=True \
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
+ actor_rollout_ref.actor.fsdp_config.model_dtype=bfloat16 \
+ actor_rollout_ref.rollout.max_model_len=15000 \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
+ actor_rollout_ref.rollout.name=sglang \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
+ actor_rollout_ref.rollout.n=5 \
+ actor_rollout_ref.rollout.multi_turn.max_assistant_turns=2 \
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=8 \
+ algorithm.use_kl_in_reward=False \
+ trainer.critic_warmup=0 \
+ trainer.val_before_train=False \
+ trainer.logger="$LOGGER_CONFIG" \
+ $WANDB_ARGS \
+ trainer.n_gpus_per_node=$NUM_GPUS_PER_NODE \
+ trainer.nnodes=$NUM_NODES \
+ trainer.save_freq=$SAVE_FREQ \
+ trainer.test_freq=$TEST_FREQ \
+ data.train_files="$TRAIN_DATA" \
+ data.val_files="$VAL_DATA" \
+ actor_rollout_ref.rollout.multi_turn.tool_config_path="$TOOL_CONFIG" \
+ trainer.total_epochs=$TOTAL_EPOCHS \
+ trainer.total_training_steps=$TOTAL_STEPS \
+ trainer.default_local_dir=/checkpoints
+
+ echo "✓ Training complete!"
+
+ # Model checkpoint merging
+ echo "Merging model checkpoints..."
+ LATEST_STEP=$(cat /checkpoints/latest_checkpointed_iteration.txt)
+ CHECKPOINT_DIR="/checkpoints/global_step_${LATEST_STEP}/actor"
+
+ python -m verl.model_merger merge \
+ --backend fsdp \
+ --tie-word-embedding \
+ --local_dir ${CHECKPOINT_DIR} \
+ --target_dir /checkpoints/hf_model
+
+ echo "✓ Model saved to /checkpoints/hf_model"
+ echo "Training artifacts saved to cloud bucket: ${CHECKPOINT_BUCKET_NAME}"
+
+ else
+ # Worker node setup
+ echo "Worker node (rank $SKYPILOT_NODE_RANK) connecting to head at $HEAD_IP:6379..."
+ sleep 15
+ ps aux | grep ray | grep $HEAD_IP:6379 &> /dev/null || ray start --address $HEAD_IP:6379 --disable-usage-stats
+ echo "✓ Worker node connected"
+ sleep infinity
+ fi
diff --git a/llm/verl/search-tooling/verl-search-interaction.yaml b/llm/verl/search-tooling/verl-search-interaction.yaml
new file mode 100644
index 00000000000..44b10c5b1d0
--- /dev/null
+++ b/llm/verl/search-tooling/verl-search-interaction.yaml
@@ -0,0 +1,351 @@
+# Search Tool Interaction Training with VERL
+#
+# This example demonstrates multi-turn tool interaction training using VERL with a search/retrieval tool.
+# The model learns to use a search tool for answering questions that require external knowledge.
+#
+# Based on: https://verl.readthedocs.io/en/v0.5.x/sglang_multiturn/search_tool_example.html
+#
+# Usage:
+# # Without WandB logging
+# sky launch -c verl-search llm/verl/search-tooling/verl-search-interaction.yaml --env DATASET_SIZE=small --env TOTAL_EPOCHS=1 -y
+#
+# # Or with WandB logging (optional)
+# sky launch -c verl-search llm/verl/search-tooling/verl-search-interaction.yaml --secret WANDB_API_KEY --env DATASET_SIZE=small --env TOTAL_EPOCHS=1 -y
+#
+# Requirements:
+# - Docker with SYS_PTRACE capability (for PyTorch multiprocessing CUDA tensor sharing)
+# - Single H100 or equivalent GPU (can be adjusted for other accelerators)
+
+resources:
+ accelerators: H100:1
+ memory: 128+
+ image_id: docker:verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2
+ ports:
+ - 8265 # Ray dashboard
+ - 8000 # Retrieval service
+
+num_nodes: 1
+
+config:
+ docker:
+ run_options:
+ - --cap-add=SYS_PTRACE # Required for PyTorch CUDA tensor sharing between Ray workers
+ - --ipc=host
+ - --shm-size=16g
+
+envs:
+ DATASET_SIZE: small # Options: small (1000 train, 200 test), medium (10k train, 2k test), full
+ TOTAL_EPOCHS: 1
+ TOTAL_STEPS: 10
+ TRAIN_BATCH_SIZE: 512
+ VAL_BATCH_SIZE: 256
+ SAVE_FREQ: 5 # Save checkpoints every 5 steps
+ TEST_FREQ: 5 # Test every 5 steps
+ MODEL_NAME: Qwen/Qwen2.5-3B-Instruct
+ WANDB_PROJECT_NAME: search_r1_like_async_rl
+ WANDB_EXPERIMENT_NAME: qwen2.5-3b-it_rm-searchR1-like-sgl-multiturn
+ CHECKPOINT_BUCKET_NAME: verl-search-interaction-checkpoints
+
+file_mounts:
+ /checkpoints:
+ name: ${CHECKPOINT_BUCKET_NAME}
+ mode: MOUNT
+
+secrets:
+ WANDB_API_KEY: "" # Optional: Set to enable WandB logging. If not set, only console logging will be used.
+
+setup: |
+ rm -f ~/.pip/pip.conf
+ rm -f ~/.config/pip/pip.conf
+
+ set -e
+
+ echo "=== VERL Search Tool Interaction Setup ==="
+
+ # System dependencies
+ echo "Installing system dependencies..."
+ sudo apt update && sudo apt install -y iproute2
+
+ # Python environment
+ echo "Setting up Python virtual environment..."
+ uv venv --python 3.10 --seed
+ source .venv/bin/activate
+
+ # Clone VERL repository
+ echo "Cloning VERL repository..."
+ rm -rf verl
+ git clone https://github.com/volcengine/verl.git
+ cd verl
+ git checkout v0.6.0
+
+ # Core dependencies
+ echo "Installing PyTorch and VERL..."
+ uv pip install "torch==2.8.*" torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
+ uv pip install "https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.8cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"
+ uv pip install -v -e .
+ uv pip install wheel
+ uv pip install packaging
+ uv pip install -r ./requirements_sglang.txt
+
+ # Search/retrieval specific dependencies
+ echo "Installing retrieval service dependencies..."
+ uv pip install faiss-gpu-cu12
+  # Pin uvloop to work around https://github.com/volcengine/verl/issues/3806
+  uv pip install uvloop==0.21.0
+
+ # Download Wikipedia corpus and FAISS index
+ echo "Downloading Wikipedia corpus and FAISS index..."
+ export save_path=~/dataset
+ mkdir -p $save_path
+
+ huggingface-cli download maknee/wiki-18-subsets wiki-18-100k.jsonl.gz --repo-type=dataset --local-dir $save_path
+ huggingface-cli download maknee/wiki-18-subsets e5_Flat-100k.index --repo-type=dataset --local-dir $save_path
+
+ # Move files to expected locations
+ mv $save_path/wiki-18-100k.jsonl.gz $save_path/wiki-18.jsonl.gz
+ mv $save_path/e5_Flat-100k.index $save_path/e5_Flat.index
+
+ # Decompress the JSONL file
+ gzip -d $save_path/wiki-18.jsonl.gz -f
+
+ # Data preparation
+ echo "Preparing search R1 dataset..."
+ python3 examples/data_preprocess/preprocess_search_r1_dataset.py
+
+  # Clone the Search-R1 reference repository
+  git clone https://github.com/PeterGriffinJin/Search-R1/
+
+  echo "✓ Setup complete!"
+
+run: |
+ set -e
+
+ echo "=== VERL Search Tool Interaction Training ==="
+
+ # Multi-node setup
+ HEAD_IP=$(echo "$SKYPILOT_NODE_IPS" | head -n1)
+ NUM_NODES=$SKYPILOT_NUM_NODES
+ NUM_GPUS_PER_NODE=$SKYPILOT_NUM_GPUS_PER_NODE
+
+ # Network configuration for distributed training
+ NETWORK_INTERFACE=$(ip route get 8.8.8.8 | grep -oP 'dev \K\S+')
+ export GLOO_SOCKET_IFNAME=$NETWORK_INTERFACE
+ export NCCL_SOCKET_IFNAME=$NETWORK_INTERFACE
+
+ # PyTorch multiprocessing configuration
+ export TORCH_MULTIPROCESSING_SHARING_STRATEGY=file_system
+
+ # Activate environment
+ source .venv/bin/activate
+
+ # Set up paths
+ cd verl
+ PROJECT_DIR="$(pwd)"
+ export PYTHONPATH="$PROJECT_DIR:$PYTHONPATH"
+
+ # Start retrieval service
+ echo "Starting retrieval server..."
+ # conda activate retriever
+ save_path=~/dataset
+ index_file=$save_path/e5_Flat.index
+ corpus_file=$save_path/wiki-18.jsonl
+ retriever_name=e5
+ retriever_path=intfloat/e5-base-v2
+
+ python examples/sglang_multiturn/search_r1_like/local_dense_retriever/retrieval_server.py \
+ --index_path $index_file \
+ --corpus_path $corpus_file \
+ --topk 3 \
+ --retriever_name $retriever_name \
+ --retriever_model $retriever_path &
+
+ RETRIEVAL_PID=$!
+ sleep 10
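+
+  # (Optional) Sanity-check the retrieval endpoint once the server is up.
+  # The /retrieve route and payload below follow the Search-R1 dense
+  # retriever server; adjust them if the upstream API differs.
+  # curl -s -X POST http://localhost:8000/retrieve \
+  #   -H 'Content-Type: application/json' \
+  #   -d '{"queries": ["Who wrote Hamlet?"], "topk": 3, "return_scores": true}'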
+
+ # WandB login (optional)
+ if [ -n "$WANDB_API_KEY" ]; then
+ echo "Logging into Weights & Biases..."
+ python3 -c "import wandb; wandb.login(relogin=True, key='$WANDB_API_KEY')"
+ fi
+
+ if [ "$SKYPILOT_NODE_RANK" == "0" ]; then
+ echo "Starting Ray head node on port 6379..."
+ ps aux | grep ray | grep 6379 &> /dev/null || ray start --head --disable-usage-stats --port=6379 --dashboard-host=0.0.0.0 --dashboard-port=8265
+
+ # Wait for all nodes to connect
+ echo "Waiting for $NUM_NODES nodes to connect..."
+ retry_count=0
+ max_retries=30
+ while [ $retry_count -lt $max_retries ]; do
+      # Note: grep -c already prints 0 when nothing matches; '|| true' just
+      # guards against the pipeline's nonzero exit status (a trailing
+      # '|| echo "0"' would print a second 0 and break the comparison).
+      connected_nodes=$(ray status 2>/dev/null | grep -c "node_" || true)
+ if [ "$connected_nodes" -ge "$NUM_NODES" ]; then
+ echo "✓ All $NUM_NODES nodes connected"
+ break
+ fi
+ retry_count=$((retry_count+1))
+ sleep 10
+ done
+
+ # Display Ray cluster status
+ echo "Ray cluster status:"
+ ray status
+
+ echo "Starting search tool interaction training..."
+ cd $PROJECT_DIR
+
+ # Increase file descriptor limit
+ ulimit -n 65535
+
+ # Set up configuration paths
+ CONFIG_PATH="$PROJECT_DIR/examples/sglang_multiturn/config"
+ TRAIN_DATA="$HOME/data/searchR1_processed_direct/train.parquet"
+ VAL_DATA="$HOME/data/searchR1_processed_direct/test.parquet"
+ TOOL_CONFIG="$CONFIG_PATH/tool_config/search_tool_config.yaml"
+
+ # Configure logging based on WANDB_API_KEY availability
+ if [ -n "$WANDB_API_KEY" ]; then
+ LOGGER_CONFIG='["console","wandb"]'
+ WANDB_ARGS="trainer.project_name=$WANDB_PROJECT_NAME trainer.experiment_name=$WANDB_EXPERIMENT_NAME"
+ echo "✓ WandB logging enabled"
+ else
+ LOGGER_CONFIG='["console"]'
+ WANDB_ARGS=""
+ echo "ℹ WandB logging disabled (no API key provided)"
+ fi
+
+ # Training with search tool
+ python3 -m verl.trainer.main_ppo \
+ --config-path="$CONFIG_PATH" \
+ --config-name='search_multiturn_grpo' \
+ algorithm.adv_estimator=grpo \
+ data.train_batch_size=$TRAIN_BATCH_SIZE \
+ data.val_batch_size=$VAL_BATCH_SIZE \
+ data.max_prompt_length=4096 \
+ data.max_response_length=3000 \
+ data.filter_overlong_prompts=True \
+ data.truncation='error' \
+ data.return_raw_chat=True \
+ actor_rollout_ref.model.path=$MODEL_NAME \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.285 \
+ actor_rollout_ref.model.use_remove_padding=True \
+ actor_rollout_ref.actor.ppo_mini_batch_size=16 \
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
+ actor_rollout_ref.actor.use_kl_loss=True \
+ actor_rollout_ref.actor.kl_loss_coef=0.001 \
+ actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+ actor_rollout_ref.actor.entropy_coeff=0 \
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
+ actor_rollout_ref.actor.fsdp_config.param_offload=True \
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
+ actor_rollout_ref.actor.fsdp_config.model_dtype=bfloat16 \
+ actor_rollout_ref.rollout.max_model_len=15000 \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
+ actor_rollout_ref.rollout.name=sglang \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
+ actor_rollout_ref.rollout.n=5 \
+ actor_rollout_ref.rollout.multi_turn.max_assistant_turns=2 \
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=8 \
+ algorithm.use_kl_in_reward=False \
+ trainer.critic_warmup=0 \
+ trainer.val_before_train=False \
+ trainer.logger="$LOGGER_CONFIG" \
+ $WANDB_ARGS \
+ trainer.n_gpus_per_node=$NUM_GPUS_PER_NODE \
+ trainer.nnodes=$NUM_NODES \
+ trainer.save_freq=$SAVE_FREQ \
+ trainer.test_freq=$TEST_FREQ \
+ data.train_files="$TRAIN_DATA" \
+ data.val_files="$VAL_DATA" \
+ actor_rollout_ref.rollout.multi_turn.tool_config_path="$TOOL_CONFIG" \
+ trainer.total_epochs=$TOTAL_EPOCHS \
+ trainer.total_training_steps=$TOTAL_STEPS \
+ trainer.default_local_dir=/checkpoints
+
+ echo "✓ Training complete!"
+
+ # Model checkpoint merging
+ echo "Merging model checkpoints..."
+ LATEST_STEP=$(cat /checkpoints/latest_checkpointed_iteration.txt)
+ CHECKPOINT_DIR="/checkpoints/global_step_${LATEST_STEP}/actor"
+
+ python -m verl.model_merger merge \
+ --backend fsdp \
+ --tie-word-embedding \
+ --local_dir ${CHECKPOINT_DIR} \
+ --target_dir /checkpoints/hf_model
+
+ echo "✓ Model saved to /checkpoints/hf_model"
+ echo "Training artifacts saved to cloud bucket: ${CHECKPOINT_BUCKET_NAME}"
+
+    # Clean up the retrieval service now that training is finished
+ if [ -n "$RETRIEVAL_PID" ]; then
+ echo "Stopping retrieval service..."
+ kill $RETRIEVAL_PID 2>/dev/null || true
+ sleep 5
+ fi
+
+ else
+ # Worker node setup
+ echo "Worker node (rank $SKYPILOT_NODE_RANK) connecting to head at $HEAD_IP:6379..."
+ sleep 15
+ ps aux | grep ray | grep $HEAD_IP:6379 &> /dev/null || ray start --address $HEAD_IP:6379 --disable-usage-stats
+ echo "✓ Worker node connected"
+ sleep infinity
+ fi
diff --git a/pyproject.toml b/pyproject.toml
index 9a509234dc4..704fa38cac1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,7 +20,7 @@ addopts = "-s -n 16 -q --tb=short --dist loadgroup --disable-warnings"
asyncio_default_fixture_loop_scope = "function"
[tool.mypy]
-python_version = "3.8"
+python_version = "3.9"
follow_imports = "skip"
ignore_missing_imports = true
allow_redefinition = true
diff --git a/requirements-dev.txt b/requirements-dev.txt
index e0dd4be1763..90c8239537e 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -16,7 +16,7 @@ isort==5.12.0
# type checking
# match the version with .pre-commit-config.yaml
-mypy==1.14.1
+mypy==1.19.1
types-PyYAML
types-paramiko
# 2.31 requires urllib3>2, which is incompatible with IBM and
diff --git a/sky/__init__.py b/sky/__init__.py
index 2aec6b62c86..5faff53c6e5 100644
--- a/sky/__init__.py
+++ b/sky/__init__.py
@@ -155,6 +155,7 @@ def set_proxy_env_var(proxy_var: str, urllib_var: Optional[str]):
Hyperbolic = clouds.Hyperbolic
Shadeform = clouds.Shadeform
Seeweb = clouds.Seeweb
+Yotta = clouds.Yotta
__all__ = [
'__version__',
@@ -180,6 +181,7 @@ def set_proxy_env_var(proxy_var: str, urllib_var: Optional[str]):
'Hyperbolic',
'Shadeform',
'Seeweb',
+ 'Yotta',
'Optimizer',
'OptimizeTarget',
'backends',
diff --git a/sky/adaptors/kubernetes.py b/sky/adaptors/kubernetes.py
index 2ecae9e26fb..a4a13f834ea 100644
--- a/sky/adaptors/kubernetes.py
+++ b/sky/adaptors/kubernetes.py
@@ -1,8 +1,15 @@
-"""Kubernetes adaptors"""
+"""Kubernetes adaptors
+
+Thread safety notes:
+
+The API functions (core_api, batch_api, etc.) return cached clients that are
+created with context-specific ApiClient instances.
+"""
import functools
import logging
import os
import platform
+import typing
from typing import Any, Callable, Optional, Set
from sky import sky_logging
@@ -11,16 +18,17 @@
from sky.utils import common_utils
from sky.utils import ux_utils
-_IMPORT_ERROR_MESSAGE = ('Failed to import dependencies for Kubernetes. '
- 'Try running: pip install "skypilot[kubernetes]"')
-kubernetes = common.LazyImport('kubernetes',
- import_error_message=_IMPORT_ERROR_MESSAGE)
-models = common.LazyImport('kubernetes.client.models',
- import_error_message=_IMPORT_ERROR_MESSAGE)
-urllib3 = common.LazyImport('urllib3',
- import_error_message=_IMPORT_ERROR_MESSAGE)
-dateutil_parser = common.LazyImport('dateutil.parser',
- import_error_message=_IMPORT_ERROR_MESSAGE)
+if typing.TYPE_CHECKING:
+ import kubernetes
+ import urllib3
+ import urllib3.exceptions
+else:
+ _IMPORT_ERROR_MESSAGE = ('Failed to import dependencies for Kubernetes. '
+ 'Try running: pip install "skypilot[kubernetes]"')
+ kubernetes = common.LazyImport('kubernetes',
+ import_error_message=_IMPORT_ERROR_MESSAGE)
+ urllib3 = common.LazyImport('urllib3',
+ import_error_message=_IMPORT_ERROR_MESSAGE)
# Timeout to use for API calls
API_TIMEOUT = 5
@@ -86,13 +94,33 @@ def _get_config_file() -> str:
return os.environ.get('KUBECONFIG', '~/.kube/config')
-def _load_config(context: Optional[str] = None):
+def _get_api_client(context: Optional[str] = None) -> Any:
+ """Get an ApiClient for the given context without modifying global config.
+
+ This is fully thread-safe because it creates isolated Configuration
+ objects for each client rather than modifying the global
+ kubernetes.client.configuration.
+
+ Args:
+ context: The Kubernetes context to use. If None, tries in-cluster config
+ first, then falls back to kubeconfig current-context.
+
+ Returns:
+ A kubernetes.client.ApiClient configured for the specified context.
+
+ Raises:
+ ValueError: If the configuration cannot be loaded.
+ """
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
- def _load_config_from_kubeconfig(context: Optional[str] = None):
+ def _get_api_client_from_kubeconfig(context: Optional[str] = None) -> Any:
+ """Load kubeconfig, return ApiClient without modifying global state."""
try:
- kubernetes.config.load_kube_config(config_file=_get_config_file(),
- context=context)
+ # new_client_from_config returns an ApiClient configured for the
+ # specified context WITHOUT modifying the global configuration.
+ # This is the key to thread-safety.
+ return kubernetes.config.new_client_from_config(
+ config_file=_get_config_file(), context=context)
except kubernetes.config.config_exception.ConfigException as e:
suffix = common_utils.format_exception(e, use_bracket=True)
context_name = '(current-context)' if context is None else context
@@ -143,20 +171,27 @@ def _load_config_from_kubeconfig(context: Optional[str] = None):
if context == in_cluster_context_name() or context is None:
try:
# Load in-cluster config if running in a pod and context is None.
- # Kubernetes set environment variables for service discovery do not
- # show up in SkyPilot tasks. For now, we work around by using
- # DNS name instead of environment variables.
- # See issue: https://github.com/skypilot-org/skypilot/issues/2287
- # Only set if not already present (preserving existing values)
+ # Use InClusterConfigLoader with an explicit Configuration object
+ # to avoid modifying global state (thread-safe).
+ #
+ # Workaround: Kubernetes service discovery environment variables
+ # may not show up in SkyPilot tasks. We set them to DNS names as
+ # a fallback. See: github.com/skypilot-org/skypilot/issues/2287
if 'KUBERNETES_SERVICE_HOST' not in os.environ:
os.environ['KUBERNETES_SERVICE_HOST'] = 'kubernetes.default.svc'
if 'KUBERNETES_SERVICE_PORT' not in os.environ:
os.environ['KUBERNETES_SERVICE_PORT'] = '443'
- kubernetes.config.load_incluster_config()
+
+ config = kubernetes.client.Configuration()
+ kubernetes.config.load_incluster_config(config)
+ return kubernetes.client.ApiClient(configuration=config)
except kubernetes.config.config_exception.ConfigException:
- _load_config_from_kubeconfig()
- else:
- _load_config_from_kubeconfig(context)
+ if context == in_cluster_context_name():
+ # Explicitly requested in-cluster context but not in a cluster
+ raise
+ # Otherwise, if context is None, fall through to kubeconfig
+
+ return _get_api_client_from_kubeconfig(context)
def list_kube_config_contexts():
@@ -219,88 +254,83 @@ def wrapper(*args, **kwargs):
@annotations.lru_cache(scope='request')
@wrap_kubernetes_client
def core_api(context: Optional[str] = None):
- _load_config(context)
- return kubernetes.client.CoreV1Api()
+ return kubernetes.client.CoreV1Api(api_client=_get_api_client(context))
@_api_logging_decorator('urllib3', logging.ERROR)
@annotations.lru_cache(scope='request')
@wrap_kubernetes_client
def storage_api(context: Optional[str] = None):
- _load_config(context)
- return kubernetes.client.StorageV1Api()
+ return kubernetes.client.StorageV1Api(api_client=_get_api_client(context))
@_api_logging_decorator('urllib3', logging.ERROR)
@annotations.lru_cache(scope='request')
@wrap_kubernetes_client
def auth_api(context: Optional[str] = None):
- _load_config(context)
- return kubernetes.client.RbacAuthorizationV1Api()
+ return kubernetes.client.RbacAuthorizationV1Api(
+ api_client=_get_api_client(context))
@_api_logging_decorator('urllib3', logging.ERROR)
@annotations.lru_cache(scope='request')
@wrap_kubernetes_client
def networking_api(context: Optional[str] = None):
- _load_config(context)
- return kubernetes.client.NetworkingV1Api()
+ return kubernetes.client.NetworkingV1Api(
+ api_client=_get_api_client(context))
@_api_logging_decorator('urllib3', logging.ERROR)
@annotations.lru_cache(scope='request')
@wrap_kubernetes_client
def custom_objects_api(context: Optional[str] = None):
- _load_config(context)
- return kubernetes.client.CustomObjectsApi()
+ return kubernetes.client.CustomObjectsApi(
+ api_client=_get_api_client(context))
@_api_logging_decorator('urllib3', logging.ERROR)
@annotations.lru_cache(scope='global')
@wrap_kubernetes_client
def node_api(context: Optional[str] = None):
- _load_config(context)
- return kubernetes.client.NodeV1Api()
+ return kubernetes.client.NodeV1Api(api_client=_get_api_client(context))
@_api_logging_decorator('urllib3', logging.ERROR)
@annotations.lru_cache(scope='request')
@wrap_kubernetes_client
def apps_api(context: Optional[str] = None):
- _load_config(context)
- return kubernetes.client.AppsV1Api()
+ return kubernetes.client.AppsV1Api(api_client=_get_api_client(context))
@_api_logging_decorator('urllib3', logging.ERROR)
@annotations.lru_cache(scope='request')
@wrap_kubernetes_client
def batch_api(context: Optional[str] = None):
- _load_config(context)
- return kubernetes.client.BatchV1Api()
+ return kubernetes.client.BatchV1Api(api_client=_get_api_client(context))
@_api_logging_decorator('urllib3', logging.ERROR)
@annotations.lru_cache(scope='request')
@wrap_kubernetes_client
def api_client(context: Optional[str] = None):
- _load_config(context)
- return kubernetes.client.ApiClient()
+ return _get_api_client(context)
@_api_logging_decorator('urllib3', logging.ERROR)
@annotations.lru_cache(scope='request')
@wrap_kubernetes_client
def custom_resources_api(context: Optional[str] = None):
- _load_config(context)
- return kubernetes.client.CustomObjectsApi()
+ return kubernetes.client.CustomObjectsApi(
+ api_client=_get_api_client(context))
@_api_logging_decorator('urllib3', logging.ERROR)
@annotations.lru_cache(scope='request')
@wrap_kubernetes_client
def watch(context: Optional[str] = None):
- _load_config(context)
- return kubernetes.watch.Watch()
+ w = kubernetes.watch.Watch()
+ w._api_client = _get_api_client(context) # pylint: disable=protected-access
+ return w
def api_exception():
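The refactor above removes the shared-global-config pattern. To make the new contract concrete, here is a minimal standalone sketch (the context names are hypothetical) of how `new_client_from_config` keeps per-context clients isolated:

    from kubernetes import client, config

    # new_client_from_config returns an ApiClient bound to the given
    # context without mutating kubernetes.client.configuration, so two
    # threads can safely talk to two clusters concurrently.
    client_a = config.new_client_from_config(context='ctx-a')
    client_b = config.new_client_from_config(context='ctx-b')

    core_a = client.CoreV1Api(api_client=client_a)
    core_b = client.CoreV1Api(api_client=client_b)

    # Each call is routed by its own client, regardless of interleaving.
    print([n.metadata.name for n in core_a.list_node().items])
    print([n.metadata.name for n in core_b.list_node().items])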
diff --git a/sky/adaptors/oci.py b/sky/adaptors/oci.py
index cbf4f9354b3..8fa6539b066 100644
--- a/sky/adaptors/oci.py
+++ b/sky/adaptors/oci.py
@@ -73,18 +73,24 @@ def service_exception():
def with_oci_env(f):
+ """Wraps a function to return a single shell command string (joined by '&&')
+ that ensures OCI CLI is available before running the actual OCI
+ command returned by `f`.
+ """
@functools.wraps(f)
def wrapper(*args, **kwargs):
- # pylint: disable=line-too-long
+ oci_venv_dir = '"$HOME/sky-oci-cli-env"'
enter_env_cmds = [
- 'conda info --envs | grep "sky-oci-cli-env" || conda create -n sky-oci-cli-env python=3.10 -y',
- '. $(conda info --base 2> /dev/null)/etc/profile.d/conda.sh > /dev/null 2>&1 || true',
- 'conda activate sky-oci-cli-env', 'pip install oci-cli',
- 'export OCI_CLI_SUPPRESS_FILE_PERMISSIONS_WARNING=True'
+ # Create the venv if missing
+ (f'[ -d {oci_venv_dir} ] || '
+ f'uv venv --seed {oci_venv_dir} --python 3.10'),
+ f'source {oci_venv_dir}/bin/activate',
+ 'uv pip install oci-cli',
+ 'export OCI_CLI_SUPPRESS_FILE_PERMISSIONS_WARNING=True',
]
operation_cmd = [f(*args, **kwargs)]
- leave_env_cmds = ['conda deactivate']
+ leave_env_cmds = ['deactivate']
return ' && '.join(enter_env_cmds + operation_cmd + leave_env_cmds)
return wrapper
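For reviewers, a quick sketch of what the rewritten decorator produces (the `list_instances` function is hypothetical; only `with_oci_env` comes from this module):

    from sky.adaptors.oci import with_oci_env

    @with_oci_env
    def list_instances(compartment_id: str) -> str:
        # The wrapped function only returns the OCI CLI command itself.
        return f'oci compute instance list --compartment-id {compartment_id}'

    # The wrapper prepends venv creation/activation and 'uv pip install
    # oci-cli', appends 'deactivate', and joins everything with ' && '.
    print(list_instances('ocid1.compartment.oc1..example'))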
diff --git a/sky/adaptors/slurm.py b/sky/adaptors/slurm.py
index 69b05c3bcfe..e937b101b63 100644
--- a/sky/adaptors/slurm.py
+++ b/sky/adaptors/slurm.py
@@ -129,6 +129,8 @@ def __init__(
ssh_proxy_command=ssh_proxy_command,
ssh_proxy_jump=ssh_proxy_jump,
enable_interactive_auth=True,
+ # Allow ssh-agent and default key fallback for Slurm.
+ disable_identities_only=True,
)
def _run_slurm_cmd(self, cmd: str) -> Tuple[int, str, str]:
@@ -625,3 +627,22 @@ def get_partitions(self) -> List[str]:
at the end of the name.
"""
return [partition.name for partition in self.get_partitions_info()]
+
+ def get_proctrack_type(self) -> Optional[str]:
+ """Get the ProctrackType from Slurm configuration.
+
+ Returns:
+ The proctrack type (e.g., 'cgroup', 'linuxproc', 'pgid'),
+ or None if it cannot be determined.
+ """
+ cmd = 'scontrol show config | grep -i "^ProctrackType"'
+ rc, stdout, stderr = self._run_slurm_cmd(cmd)
+ if rc != 0:
+ logger.warning(f'Failed to get ProctrackType: {stderr}')
+ return None
+
+ # Parse output like "ProctrackType = proctrack/cgroup"
+ match = re.search(r'ProctrackType\s*=\s*proctrack/(\w+)', stdout)
+ if match:
+ return match.group(1)
+ return None
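A minimal self-contained check of the parsing logic, using a sample line in `scontrol show config` output format:

    import re

    sample = 'ProctrackType           = proctrack/cgroup'
    match = re.search(r'ProctrackType\s*=\s*proctrack/(\w+)', sample)
    assert match is not None and match.group(1) == 'cgroup'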
diff --git a/sky/adaptors/yotta.py b/sky/adaptors/yotta.py
new file mode 100644
index 00000000000..59c3808e58c
--- /dev/null
+++ b/sky/adaptors/yotta.py
@@ -0,0 +1 @@
+"""Yotta cloud adaptor."""
diff --git a/sky/authentication.py b/sky/authentication.py
index a2e14947a12..fac32878eef 100644
--- a/sky/authentication.py
+++ b/sky/authentication.py
@@ -28,6 +28,7 @@
import uuid
import colorama
+import filelock
from sky import clouds
from sky import exceptions
@@ -228,9 +229,14 @@ def setup_lambda_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
with open(public_key_path, 'r', encoding='utf-8') as f:
public_key = f.read().strip()
prefix = f'sky-key-{common_utils.get_user_hash()}'
- name, exists = lambda_client.get_unique_ssh_key_name(prefix, public_key)
- if not exists:
- lambda_client.register_ssh_key(name, public_key)
+
+ lock_path = os.path.expanduser(
+ '~/.sky/locks/lambda-cloud-ssh-key-registration.lock')
+ os.makedirs(os.path.dirname(lock_path), exist_ok=True)
+ with filelock.FileLock(lock_path):
+ name, exists = lambda_client.get_unique_ssh_key_name(prefix, public_key)
+ if not exists:
+ lambda_client.register_ssh_key(name, public_key)
config['auth']['remote_key_name'] = name
return config
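The lock added above closes a check-then-register race between concurrent launches on the same machine. The general pattern, as a minimal sketch with stand-in lookup/register steps (the paths are hypothetical):

    import os

    import filelock

    lock_path = os.path.expanduser('~/.sky/locks/example-registration.lock')
    os.makedirs(os.path.dirname(lock_path), exist_ok=True)

    # Both the existence check and the registration must happen under the
    # same lock; otherwise two processes can each observe "missing" and
    # register duplicate keys.
    with filelock.FileLock(lock_path):
        registered = os.path.exists('/tmp/registered-key')  # stand-in lookup
        if not registered:
            open('/tmp/registered-key', 'w').close()  # stand-in register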
diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py
index 97abbd4d04c..b6fad8a9f79 100644
--- a/sky/backends/backend_utils.py
+++ b/sky/backends/backend_utils.py
@@ -139,7 +139,7 @@
# Time that must elapse since the last status check before we should re-check if
# the cluster has been terminated or autostopped.
-_CLUSTER_STATUS_CACHE_DURATION_SECONDS = 2
+CLUSTER_STATUS_CACHE_DURATION_SECONDS = 2
CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS = 10
WORKSPACE_LOCK_TIMEOUT_SECONDS = 10
@@ -653,6 +653,7 @@ def write_cluster_config(
dryrun: bool = False,
keep_launch_fields_in_existing_config: bool = True,
volume_mounts: Optional[List['volume_utils.VolumeMount']] = None,
+ cloud_specific_failover_overrides: Optional[Dict[str, Any]] = None,
) -> Dict[str, str]:
"""Fills in cluster configuration templates and writes them out.
@@ -726,7 +727,8 @@ def write_cluster_config(
cloud=str(cloud).lower(),
region=region.name,
keys=('remote_identity',),
- default_value=None)
+ default_value=None,
+ override_configs=to_provision.cluster_config_overrides)
remote_identity = schemas.get_default_remote_identity(str(cloud).lower())
if isinstance(remote_identity_config, str):
remote_identity = remote_identity_config
@@ -899,6 +901,9 @@ def write_cluster_config(
if to_provision.labels:
labels.update(to_provision.labels)
+ install_conda = skypilot_config.get_nested(('provision', 'install_conda'),
+ True)
+
# We disable conda auto-activation if the user has specified a docker image
# to use, which is likely to already have a conda environment activated.
conda_auto_activate = ('true' if to_provision.extract_docker_image() is None
@@ -949,114 +954,122 @@ def write_cluster_config(
# Use a tmp file path to avoid incomplete YAML file being re-used in the
# future.
tmp_yaml_path = yaml_path + '.tmp'
- common_utils.fill_template(
- cluster_config_template,
- dict(
- resources_vars,
- **{
- 'cluster_name_on_cloud': cluster_name_on_cloud,
- 'num_nodes': num_nodes,
- 'disk_size': to_provision.disk_size,
- # If the current code is run by controller, propagate the real
- # calling user which should've been passed in as the
- # SKYPILOT_USER env var (see
- # controller_utils.shared_controller_vars_to_fill().
- 'user': common_utils.get_cleaned_username(
- os.environ.get(constants.USER_ENV_VAR, '')),
-
- # Networking configs
- 'use_internal_ips': skypilot_config.get_effective_region_config(
- cloud=str(cloud).lower(),
- region=region.name,
- keys=('use_internal_ips',),
- default_value=False),
- 'ssh_proxy_command': ssh_proxy_command,
- 'vpc_name': skypilot_config.get_effective_region_config(
- cloud=str(cloud).lower(),
- region=region.name,
- keys=('vpc_name',),
- default_value=None),
- # User-supplied labels.
- 'labels': labels,
- # User-supplied remote_identity
- 'remote_identity': remote_identity,
- # The reservation pools that specified by the user. This is
- # currently only used by AWS and GCP.
- 'specific_reservations': specific_reservations,
-
- # Conda setup
- # We should not use `.format`, as it contains '{}' as the bash
- # syntax.
- 'conda_installation_commands':
- constants.CONDA_INSTALLATION_COMMANDS.replace(
- '{conda_auto_activate}',
- conda_auto_activate).replace('{is_custom_docker}',
- is_custom_docker),
- # Currently only used by Slurm. For other clouds, it is
- # already part of ray_skypilot_installation_commands
- 'setup_sky_dirs_commands': constants.SETUP_SKY_DIRS_COMMANDS,
- 'ray_skypilot_installation_commands':
- (constants.RAY_SKYPILOT_INSTALLATION_COMMANDS.replace(
- '{sky_wheel_hash}',
- wheel_hash).replace('{cloud}',
- str(cloud).lower())),
- 'skypilot_wheel_installation_commands':
- constants.SKYPILOT_WHEEL_INSTALLATION_COMMANDS.replace(
- '{sky_wheel_hash}',
- wheel_hash).replace('{cloud}',
- str(cloud).lower()),
- 'copy_skypilot_templates_commands':
- constants.COPY_SKYPILOT_TEMPLATES_COMMANDS,
- # Port of Ray (GCS server).
- # Ray's default port 6379 is conflicted with Redis.
- 'ray_port': constants.SKY_REMOTE_RAY_PORT,
- 'ray_dashboard_port': constants.SKY_REMOTE_RAY_DASHBOARD_PORT,
- 'ray_temp_dir': constants.SKY_REMOTE_RAY_TEMPDIR,
- 'dump_port_command': instance_setup.DUMP_RAY_PORTS,
- # Sky-internal constants.
- 'sky_ray_cmd': constants.SKY_RAY_CMD,
- # pip install needs to have python env activated to make sure
- # installed packages are within the env path.
- 'sky_pip_cmd': f'{constants.SKY_PIP_CMD}',
- # Activate the SkyPilot runtime environment when starting ray
- # cluster, so that ray autoscaler can access cloud SDK and CLIs
- # on remote
- 'sky_activate_python_env':
- constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV,
- 'ray_version': constants.SKY_REMOTE_RAY_VERSION,
- # Command for waiting ray cluster to be ready on head.
- 'ray_head_wait_initialized_command':
- instance_setup.RAY_HEAD_WAIT_INITIALIZED_COMMAND,
-
- # Cloud credentials for cloud storage.
- 'credentials': credentials,
- # Sky remote utils.
- 'sky_remote_path': SKY_REMOTE_PATH,
- 'sky_local_path': str(local_wheel_path),
- # Add yaml file path to the template variables.
- 'sky_ray_yaml_remote_path':
- cluster_utils.SKY_CLUSTER_YAML_REMOTE_PATH,
- 'sky_ray_yaml_local_path': tmp_yaml_path,
- 'sky_version': str(version.parse(sky.__version__)),
- 'sky_wheel_hash': wheel_hash,
- 'ssh_max_sessions_config':
- constants.SET_SSH_MAX_SESSIONS_CONFIG_CMD,
- # Authentication (optional).
- **auth_config,
-
- # Controller specific configs
- 'is_remote_controller': is_remote_controller,
- 'high_availability': high_availability_specified,
-
- # Volume mounts
- 'volume_mounts': volume_mount_vars,
- 'ephemeral_volume_mounts': ephemeral_volume_mount_vars,
-
- # runcmd to run before any of the SkyPilot runtime setup commands.
- # This is currently only used by AWS and Kubernetes.
- 'runcmd': runcmd,
- }),
- output_path=tmp_yaml_path)
+ variables = dict(
+ resources_vars,
+ **{
+ 'cluster_name_on_cloud': cluster_name_on_cloud,
+ 'num_nodes': num_nodes,
+ 'disk_size': to_provision.disk_size,
+ # If the current code is run by controller, propagate the real
+ # calling user which should've been passed in as the
+ # SKYPILOT_USER env var (see
+ # controller_utils.shared_controller_vars_to_fill().
+ 'user': common_utils.get_cleaned_username(
+ os.environ.get(constants.USER_ENV_VAR, '')),
+
+ # Networking configs
+ 'use_internal_ips': skypilot_config.get_effective_region_config(
+ cloud=str(cloud).lower(),
+ region=region.name,
+ keys=('use_internal_ips',),
+ default_value=False),
+ 'ssh_proxy_command': ssh_proxy_command,
+ # TODO (kyuds): for backwards compatibility. If `vpc_names`
+ # is set, this will be overridden. We can remove this after
+ # v0.13.0 if all clouds that currently support `vpc_name`
+ # migrates to `vpc_names` (ie: gcp)
+ 'vpc_name': skypilot_config.get_effective_region_config(
+ cloud=str(cloud).lower(),
+ region=region.name,
+ keys=('vpc_name',),
+ default_value=None),
+ # User-supplied labels.
+ 'labels': labels,
+ # User-supplied remote_identity
+ 'remote_identity': remote_identity,
+ # The reservation pools that specified by the user. This is
+ # currently only used by AWS and GCP.
+ 'specific_reservations': specific_reservations,
+
+ # Conda setup
+ # We should not use `.format`, as it contains '{}' as the bash
+ # syntax.
+ 'conda_installation_commands':
+ constants.CONDA_INSTALLATION_COMMANDS.replace(
+ '{conda_auto_activate}', conda_auto_activate).replace(
+ '{is_custom_docker}', is_custom_docker)
+ if install_conda else '',
+ # UV setup
+ 'uv_installation_commands': constants.UV_INSTALLATION_COMMANDS,
+ # Currently only used by Slurm. For other clouds, it is
+ # already part of ray_skypilot_installation_commands
+ 'setup_sky_dirs_commands': constants.SETUP_SKY_DIRS_COMMANDS,
+ 'ray_skypilot_installation_commands':
+ (constants.RAY_SKYPILOT_INSTALLATION_COMMANDS.replace(
+ '{sky_wheel_hash}',
+ wheel_hash).replace('{cloud}',
+ str(cloud).lower())),
+ 'skypilot_wheel_installation_commands':
+ constants.SKYPILOT_WHEEL_INSTALLATION_COMMANDS.replace(
+ '{sky_wheel_hash}',
+ wheel_hash).replace('{cloud}',
+ str(cloud).lower()),
+ 'copy_skypilot_templates_commands':
+ constants.COPY_SKYPILOT_TEMPLATES_COMMANDS,
+ # Port of Ray (GCS server).
+ # Ray's default port 6379 is conflicted with Redis.
+ 'ray_port': constants.SKY_REMOTE_RAY_PORT,
+ 'ray_dashboard_port': constants.SKY_REMOTE_RAY_DASHBOARD_PORT,
+ 'ray_temp_dir': constants.SKY_REMOTE_RAY_TEMPDIR,
+ 'dump_port_command': instance_setup.DUMP_RAY_PORTS,
+ # Sky-internal constants.
+ 'sky_ray_cmd': constants.SKY_RAY_CMD,
+ # pip install needs to have python env activated to make sure
+ # installed packages are within the env path.
+ 'sky_pip_cmd': f'{constants.SKY_PIP_CMD}',
+ # Activate the SkyPilot runtime environment when starting ray
+ # cluster, so that ray autoscaler can access cloud SDK and CLIs
+ # on remote
+ 'sky_activate_python_env': constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV,
+ 'ray_version': constants.SKY_REMOTE_RAY_VERSION,
+ # Command for waiting ray cluster to be ready on head.
+ 'ray_head_wait_initialized_command':
+ instance_setup.RAY_HEAD_WAIT_INITIALIZED_COMMAND,
+
+ # Cloud credentials for cloud storage.
+ 'credentials': credentials,
+ # Sky remote utils.
+ 'sky_remote_path': SKY_REMOTE_PATH,
+ 'sky_local_path': str(local_wheel_path),
+ # Add yaml file path to the template variables.
+ 'sky_ray_yaml_remote_path':
+ cluster_utils.SKY_CLUSTER_YAML_REMOTE_PATH,
+ 'sky_ray_yaml_local_path': tmp_yaml_path,
+ 'sky_version': str(version.parse(sky.__version__)),
+ 'sky_wheel_hash': wheel_hash,
+ 'ssh_max_sessions_config':
+ constants.SET_SSH_MAX_SESSIONS_CONFIG_CMD,
+ # Authentication (optional).
+ **auth_config,
+
+ # Controller specific configs
+ 'is_remote_controller': is_remote_controller,
+ 'high_availability': high_availability_specified,
+
+ # Volume mounts
+ 'volume_mounts': volume_mount_vars,
+ 'ephemeral_volume_mounts': ephemeral_volume_mount_vars,
+
+ # runcmd to run before any of the SkyPilot runtime setup commands.
+ # This is currently only used by AWS and Kubernetes.
+ 'runcmd': runcmd,
+ },
+ )
+ if cloud_specific_failover_overrides is not None:
+ variables.update(cloud_specific_failover_overrides)
+ common_utils.fill_template(cluster_config_template,
+ variables,
+ output_path=tmp_yaml_path)
config_dict['cluster_name'] = cluster_name
config_dict['ray'] = yaml_path
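The point of unrolling the `fill_template` call into an explicit `variables` dict is that per-attempt failover overrides can now be layered on top before rendering. A minimal sketch of the precedence using jinja2 directly (the variable names are hypothetical):

    import jinja2

    template = jinja2.Template('sg: {{ security_group }}, vpc: {{ vpc_name }}')
    variables = {'security_group': 'default-sg', 'vpc_name': 'main'}

    # Overrides win because they are applied last, via dict.update.
    variables.update({'security_group': 'fallback-sg'})

    assert template.render(**variables) == 'sg: fallback-sg, vpc: main'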
@@ -1173,6 +1186,7 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, tmp_yaml_path: str):
clouds.Azure,
clouds.DO,
clouds.Nebius,
+ clouds.Yotta,
)):
config = auth.configure_ssh_info(config)
elif isinstance(cloud, clouds.GCP):
@@ -1509,23 +1523,6 @@ def wait_until_ray_cluster_ready(
return True, docker_user # success
-def _get_ssh_control_name(config: Dict[str, Any]) -> str:
- ssh_provider_module = config['provider']['module']
- ssh_control_name = config.get('cluster_name',
- command_runner.DEFAULT_SSH_CONTROL_NAME)
- if 'slurm' in ssh_provider_module:
- # For Slurm, multiple SkyPilot clusters may share the same underlying
- # Slurm login node. By using a fixed ssh_control_name ('__default__'),
- # we ensure that all connections to the same login node reuse the same
- # SSH ControlMaster process, avoiding repeated SSH handshakes.
- #
- # The %C token in ControlPath (see ssh_options_list) ensures that
- # connections to different login nodes use different sockets, avoiding
- # collisions between different Slurm clusters.
- ssh_control_name = command_runner.DEFAULT_SSH_CONTROL_NAME
- return ssh_control_name
-
-
def ssh_credential_from_yaml(
cluster_yaml: Optional[str],
docker_user: Optional[str] = None,
@@ -1546,7 +1543,7 @@ def ssh_credential_from_yaml(
if ssh_user is None:
ssh_user = auth_section['ssh_user'].strip()
ssh_private_key_path = auth_section.get('ssh_private_key')
- ssh_control_name = _get_ssh_control_name(config)
+ ssh_control_name = config.get('cluster_name', '__default__')
ssh_proxy_command = auth_section.get('ssh_proxy_command')
# Update the ssh_user placeholder in proxy command, if required
@@ -1600,7 +1597,7 @@ def ssh_credentials_from_handles(
if ssh_user is None:
ssh_user = auth_section['ssh_user'].strip()
ssh_private_key_path = auth_section.get('ssh_private_key')
- ssh_control_name = _get_ssh_control_name(config)
+ ssh_control_name = config.get('cluster_name', '__default__')
ssh_proxy_command = auth_section.get('ssh_proxy_command')
# Update the ssh_user placeholder in proxy command, if required
@@ -2431,6 +2428,42 @@ def run_ray_status_to_check_ray_cluster_healthy() -> bool:
exc_info=e)
return False
+ def _handle_autostopping_cluster(
+ print_newline: bool = False) -> Optional[Dict[str, Any]]:
+ """Handle cluster that is autostopping/autodowning.
+
+ Sets the cluster status to AUTOSTOPPING and returns the cluster record.
+
+ Args:
+ print_newline: Whether to print a newline before logging (for UX).
+
+ Returns:
+ Cluster record if autostopping, None otherwise.
+ """
+ # The cluster is autostopping - set to AUTOSTOPPING status
+ if print_newline:
+ ux_utils.console_newline()
+ operation_str = 'autodowning' if record.get('to_down',
+ False) else 'autostopping'
+ logger.info(f'Cluster {cluster_name!r} is {operation_str}.')
+
+ # Set cluster to AUTOSTOPPING status
+ record['status'] = status_lib.ClusterStatus.AUTOSTOPPING
+ global_user_state.add_cluster_event(
+ cluster_name,
+ status_lib.ClusterStatus.AUTOSTOPPING,
+ f'Cluster is {operation_str}.',
+ global_user_state.ClusterEventType.STATUS_CHANGE,
+ nop_if_duplicate=True)
+ # Use set_cluster_status() to directly update the status in DB
+ # instead of add_or_update_cluster() which only supports INIT/UP
+ global_user_state.set_cluster_status(
+ cluster_name, status_lib.ClusterStatus.AUTOSTOPPING)
+ return global_user_state.get_cluster_from_name(
+ cluster_name,
+ include_user_info=include_user_info,
+ summary_response=summary_response)
+
# Determining if the cluster is healthy (UP):
#
# For non-spot clusters: If ray status shows all nodes are healthy, it is
@@ -2452,6 +2485,13 @@ def run_ray_status_to_check_ray_cluster_healthy() -> bool:
# NOTE: all_nodes_up calculation is fast due to calling cloud CLI;
# run_ray_status_to_check_all_nodes_up() is slow due to calling `ray get
# head-ip/worker-ips`.
+
+ # Check if the cluster is in the process of autostopping
+ backend = get_backend_from_handle(handle)
+ if isinstance(backend, backends.CloudVmRayBackend):
+ if backend.is_definitely_autostopping(handle, stream_logs=False):
+ return _handle_autostopping_cluster(print_newline=False)
+
record['status'] = status_lib.ClusterStatus.UP
# Add cluster event for instance status check.
global_user_state.add_cluster_event(
@@ -2586,12 +2626,24 @@ def run_ray_status_to_check_ray_cluster_healthy() -> bool:
backend = get_backend_from_handle(handle)
if isinstance(backend, backends.CloudVmRayBackend):
- if is_head_node_alive:
+            # Check autostopping first, before the head-node-alive check.
+            # This ensures we detect AUTOSTOPPING even when Ray becomes
+            # unhealthy during hook execution, or when the nodes are only
+            # partially autostopped.
+ is_autostopping = backend.is_definitely_autostopping(
+ handle, stream_logs=False)
+
+ if is_autostopping:
+ logger.debug(
+ f'The cluster {cluster_name!r} is abnormal '
+ f'({init_reason}) but is definitely autostopping. '
+ 'Returning AUTOSTOPPING status.')
+ return _handle_autostopping_cluster(print_newline=True)
+ elif is_head_node_alive:
logger.debug(
f'Skipping autostop reset for cluster {cluster_name!r} '
'because the head node is alive.')
- elif not backend.is_definitely_autostopping(handle,
- stream_logs=False):
+ elif not is_autostopping:
# Friendly hint.
autostop = record['autostop']
maybe_down_str = ' --down' if record['to_down'] else ''
@@ -2642,13 +2694,6 @@ def run_ray_status_to_check_ray_cluster_healthy() -> bool:
f'abnormal state. To fix, try running: {reset}{bright}sky '
f'start -f -i {autostop}{maybe_down_str} {cluster_name}'
f'{reset}')
- else:
- ux_utils.console_newline()
- operation_str = 'autodowning' if record[
- 'to_down'] else 'autostopping'
- logger.info(
- f'Cluster {cluster_name!r} is {operation_str}. Setting to '
- 'INIT status; try refresh again in a while.')
# If the user starts part of a STOPPED cluster, we still need a status
# to represent the abnormal status. For spot cluster, it can also
@@ -2734,10 +2779,13 @@ def _must_refresh_cluster_status(
use_spot = record['handle'].launched_resources.use_spot
has_autostop = (record['status'] != status_lib.ClusterStatus.STOPPED and
record['autostop'] >= 0)
+    # If the cluster is AUTOSTOPPING, always refresh to check whether it
+    # has transitioned to STOPPED.
+ is_autostopping = record['status'] == status_lib.ClusterStatus.AUTOSTOPPING
recently_refreshed = (record['status_updated_at'] is not None and
time.time() - record['status_updated_at'] <
- _CLUSTER_STATUS_CACHE_DURATION_SECONDS)
- is_stale = (use_spot or has_autostop) and not recently_refreshed
+ CLUSTER_STATUS_CACHE_DURATION_SECONDS)
+ is_stale = (use_spot or has_autostop or
+ is_autostopping) and not recently_refreshed
return force_refresh_for_cluster or is_stale
@@ -2764,7 +2812,7 @@ def refresh_cluster_record(
following conditions will be refreshed regardless of whether the argument
is specified:
- the latest available status update is more than
- _CLUSTER_STATUS_CACHE_DURATION_SECONDS old, and one of:
+ CLUSTER_STATUS_CACHE_DURATION_SECONDS old, and one of:
1. the cluster is a spot cluster, or
2. cluster autostop is set and the cluster is not STOPPED.
cluster_lock_already_held: Whether the caller is already holding the
@@ -3021,7 +3069,8 @@ def check_cluster_available(
f'cluster {cluster_name!r}. It is only supported by backend: '
f'{backends.CloudVmRayBackend.NAME}.'
f'{reset}')
- if cluster_status != status_lib.ClusterStatus.UP:
+ if cluster_status not in (status_lib.ClusterStatus.UP,
+ status_lib.ClusterStatus.AUTOSTOPPING):
with ux_utils.print_exception_no_traceback():
hint_for_init = ''
if cluster_status == status_lib.ClusterStatus.INIT:
@@ -3033,7 +3082,8 @@ def check_cluster_available(
f'{colorama.Fore.YELLOW}{operation.capitalize()}: skipped for '
f'cluster {cluster_name!r} (status: {cluster_status.value}). '
'It is only allowed for '
- f'{status_lib.ClusterStatus.UP.value} clusters.'
+ f'{status_lib.ClusterStatus.UP.value} and '
+ f'{status_lib.ClusterStatus.AUTOSTOPPING.value} clusters.'
f'{hint_for_init}'
f'{reset}',
cluster_status=cluster_status,
@@ -3174,7 +3224,9 @@ def is_controller_accessible(
if not runner.check_connection():
error_msg = controller.value.connection_error_hint
else:
- assert controller_status == status_lib.ClusterStatus.UP, handle
+ assert controller_status in (
+ status_lib.ClusterStatus.UP,
+ status_lib.ClusterStatus.AUTOSTOPPING), handle
if error_msg is not None:
if exit_if_not_accessible:
@@ -3802,8 +3854,9 @@ def get_endpoints(cluster: str,
f'Cluster {cluster!r} not found.', cluster_status=None)
assert len(cluster_records) == 1, cluster_records
cluster_record = cluster_records[0]
- if (not skip_status_check and
- cluster_record['status'] != status_lib.ClusterStatus.UP):
+ if (not skip_status_check and cluster_record['status']
+ not in (status_lib.ClusterStatus.UP,
+ status_lib.ClusterStatus.AUTOSTOPPING)):
with ux_utils.print_exception_no_traceback():
raise exceptions.ClusterNotUpError(
f'Cluster {cluster_record["name"]!r} '
diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py
index 3eeb16e70c4..927afb4331e 100644
--- a/sky/backends/cloud_vm_ray_backend.py
+++ b/sky/backends/cloud_vm_ray_backend.py
@@ -44,14 +44,17 @@
from sky.backends import wheel_utils
from sky.clouds import cloud as sky_cloud
from sky.clouds.utils import gcp_utils
+from sky.dag import DEFAULT_EXECUTION
from sky.data import data_utils
from sky.data import storage as storage_lib
from sky.provision import common as provision_common
+from sky.provision import constants as provision_constants
from sky.provision import instance_setup
from sky.provision import metadata_utils
from sky.provision import provisioner
from sky.provision.kubernetes import config as config_lib
from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.provision.slurm import utils as slurm_utils
from sky.serve import constants as serve_constants
from sky.server.requests import requests as requests_lib
from sky.skylet import autostop_lib
@@ -184,6 +187,7 @@
_MAX_GET_ZONE_RETRY = 3
_JOB_ID_PATTERN = re.compile(r'Job ID: ([0-9]+)')
+_JOB_IDS_PATTERN = re.compile(r'Job IDs: ([0-9,]+)')
_LOG_DIR_PATTERN = re.compile(r'Log Dir: ([^ ]+)')
# Path to the monkey-patched ray up script.
@@ -285,7 +289,8 @@ def _get_cluster_config_template(cloud):
clouds.Fluidstack: 'fluidstack-ray.yml.j2',
clouds.Nebius: 'nebius-ray.yml.j2',
clouds.Hyperbolic: 'hyperbolic-ray.yml.j2',
- clouds.Seeweb: 'seeweb-ray.yml.j2'
+ clouds.Seeweb: 'seeweb-ray.yml.j2',
+ clouds.Yotta: 'yotta-ray.yml.j2',
}
return cloud_to_template[type(cloud)]
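The `_retry_zones` hunk further below wraps config generation in a new per-cloud loop over `yield_cloud_specific_failover_overrides(region=...)`. A sketch of the contract that loop implies, assuming the base implementation yields a single empty dict (i.e. one attempt, no overrides) and a cloud may yield further override sets to retry the same region with different template variables (the override key here is hypothetical):

    from typing import Any, Dict, Iterator, Optional

    def yield_cloud_specific_failover_overrides(
            region: Optional[str] = None) -> Iterator[Dict[str, Any]]:
        """Base contract: one attempt with no template overrides."""
        yield {}

    def yield_example_cloud_overrides(
            region: Optional[str] = None) -> Iterator[Dict[str, Any]]:
        """Example cloud: try the default config first, then retry the
        same region with a fallback network setting."""
        yield {}
        yield {'network_tier': 'standard'}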
@@ -932,7 +937,7 @@ def _insufficient_resources_msg(
message += (f'{to_provision.cloud} for {requested_resources}. ')
return message
- def _retry_zones(
+ def _retry_zones( # pylint: disable=line-too-long
self,
to_provision: resources_lib.Resources,
num_nodes: int,
@@ -1044,334 +1049,346 @@ def _retry_zones(
else:
zone_str = ','.join(z.name for z in zones)
zone_str = f' ({zone_str})'
- try:
- config_dict = backend_utils.write_cluster_config(
- to_provision,
- num_nodes,
- _get_cluster_config_template(to_provision.cloud),
- cluster_name,
- self._local_wheel_path,
- self._wheel_hash,
- region=region,
- zones=zones,
- dryrun=dryrun,
- keep_launch_fields_in_existing_config=cluster_exists,
- volume_mounts=volume_mounts,
- )
- except exceptions.ResourcesUnavailableError as e:
- # Failed due to catalog issue, e.g. image not found, or
- # GPUs are requested in a Kubernetes cluster but the cluster
- # does not have nodes labeled with GPU types.
- logger.info(f'{e}')
- continue
- except exceptions.InvalidCloudCredentials as e:
- # Failed due to invalid cloud credentials.
- logger.warning(f'{common_utils.format_exception(e)}')
- # We should block the entire cloud for invalid cloud credentials
- _add_to_blocked_resources(
- self._blocked_resources,
- to_provision.copy(region=None, zone=None))
- raise exceptions.ResourcesUnavailableError(
- f'Failed to provision on cloud {to_provision.cloud} due to '
- f'invalid cloud credentials: '
- f'{common_utils.format_exception(e)}')
- except exceptions.InvalidCloudConfigs as e:
- # Failed due to invalid user configs in ~/.sky/config.yaml.
- logger.warning(f'{common_utils.format_exception(e)}')
- # We should block the entire cloud if the user config is
- # invalid.
- _add_to_blocked_resources(
- self._blocked_resources,
- to_provision.copy(region=None, zone=None))
- raise exceptions.ResourcesUnavailableError(
- f'Failed to provision on cloud {to_provision.cloud} due to '
- f'invalid cloud config: {common_utils.format_exception(e)}')
- if ('config_hash' in config_dict and
- skip_if_config_hash_matches == config_dict['config_hash']):
- logger.debug('Skipping provisioning of cluster with matching '
- 'config hash.')
- config_dict['provisioning_skipped'] = True
- return config_dict
- config_dict['provisioning_skipped'] = False
+ for failover_overrides in to_provision.cloud.yield_cloud_specific_failover_overrides(
+ region=to_provision.region):
+ try:
+ config_dict = backend_utils.write_cluster_config(
+ to_provision,
+ num_nodes,
+ _get_cluster_config_template(to_provision.cloud),
+ cluster_name,
+ self._local_wheel_path,
+ self._wheel_hash,
+ region=region,
+ zones=zones,
+ dryrun=dryrun,
+ keep_launch_fields_in_existing_config=cluster_exists,
+ volume_mounts=volume_mounts,
+ cloud_specific_failover_overrides=failover_overrides,
+ )
+ except exceptions.ResourcesUnavailableError as e:
+ # Failed due to catalog issue, e.g. image not found, or
+ # GPUs are requested in a Kubernetes cluster but the cluster
+ # does not have nodes labeled with GPU types.
+ logger.info(f'{e}')
+ continue
+ except exceptions.InvalidCloudCredentials as e:
+ # Failed due to invalid cloud credentials.
+ logger.warning(f'{common_utils.format_exception(e)}')
+ # We should block the entire cloud for invalid cloud credentials
+ _add_to_blocked_resources(
+ self._blocked_resources,
+ to_provision.copy(region=None, zone=None))
+ raise exceptions.ResourcesUnavailableError(
+ f'Failed to provision on cloud {to_provision.cloud} due to '
+ f'invalid cloud credentials: '
+ f'{common_utils.format_exception(e)}')
+ except exceptions.InvalidCloudConfigs as e:
+ # Failed due to invalid user configs in ~/.sky/config.yaml.
+ logger.warning(f'{common_utils.format_exception(e)}')
+ # We should block the entire cloud if the user config is
+ # invalid.
+ _add_to_blocked_resources(
+ self._blocked_resources,
+ to_provision.copy(region=None, zone=None))
+ raise exceptions.ResourcesUnavailableError(
+ f'Failed to provision on cloud {to_provision.cloud} due to '
+ f'invalid cloud config: {common_utils.format_exception(e)}'
+ )
- if dryrun:
- return config_dict
+ if ('config_hash' in config_dict and skip_if_config_hash_matches
+ == config_dict['config_hash']):
+ logger.debug(
+ 'Skipping provisioning of cluster with matching '
+ 'config hash.')
+ config_dict['provisioning_skipped'] = True
+ return config_dict
+ config_dict['provisioning_skipped'] = False
- cluster_config_file = config_dict['ray']
+ if dryrun:
+ return config_dict
- launched_resources = to_provision.copy(region=region.name)
- if zones and len(zones) == 1:
- launched_resources = launched_resources.copy(zone=zones[0].name)
-
- prev_cluster_ips, prev_ssh_ports, prev_cluster_info = (None, None,
- None)
- if prev_handle is not None:
- prev_cluster_ips = prev_handle.stable_internal_external_ips
- prev_ssh_ports = prev_handle.stable_ssh_ports
- prev_cluster_info = prev_handle.cached_cluster_info
- # Record early, so if anything goes wrong, 'sky status' will show
- # the cluster name and users can appropriately 'sky down'. It also
- # means a second 'sky launch -c ' will attempt to reuse.
- handle = CloudVmRayResourceHandle(
- cluster_name=cluster_name,
- # Backward compatibility will be guaranteed by the underlying
- # backend_utils.write_cluster_config, which gets the cluster
- # name on cloud from the ray yaml file, if the previous cluster
- # exists.
- cluster_name_on_cloud=config_dict['cluster_name_on_cloud'],
- cluster_yaml=cluster_config_file,
- launched_nodes=num_nodes,
- # OK for this to be shown in CLI as status == INIT.
- launched_resources=launched_resources,
- # Use the previous cluster's IPs and ports if available to
- # optimize the case where the cluster is restarted, i.e., no
- # need to query IPs and ports from the cloud provider.
- stable_internal_external_ips=prev_cluster_ips,
- stable_ssh_ports=prev_ssh_ports,
- cluster_info=prev_cluster_info,
- )
- usage_lib.messages.usage.update_final_cluster_status(
- status_lib.ClusterStatus.INIT)
+ cluster_config_file = config_dict['ray']
+
+ launched_resources = to_provision.copy(region=region.name)
+ if zones and len(zones) == 1:
+ launched_resources = launched_resources.copy(
+ zone=zones[0].name)
+
+ prev_cluster_ips, prev_ssh_ports, prev_cluster_info = (None,
+ None,
+ None)
+ if prev_handle is not None:
+ prev_cluster_ips = prev_handle.stable_internal_external_ips
+ prev_ssh_ports = prev_handle.stable_ssh_ports
+ prev_cluster_info = prev_handle.cached_cluster_info
+ # Record early, so if anything goes wrong, 'sky status' will show
+ # the cluster name and users can appropriately 'sky down'. It also
+ # means a second 'sky launch -c ' will attempt to reuse.
+ handle = CloudVmRayResourceHandle(
+ cluster_name=cluster_name,
+ # Backward compatibility will be guaranteed by the underlying
+ # backend_utils.write_cluster_config, which gets the cluster
+ # name on cloud from the ray yaml file, if the previous cluster
+ # exists.
+ cluster_name_on_cloud=config_dict['cluster_name_on_cloud'],
+ cluster_yaml=cluster_config_file,
+ launched_nodes=num_nodes,
+ # OK for this to be shown in CLI as status == INIT.
+ launched_resources=launched_resources,
+ # Use the previous cluster's IPs and ports if available to
+ # optimize the case where the cluster is restarted, i.e., no
+ # need to query IPs and ports from the cloud provider.
+ stable_internal_external_ips=prev_cluster_ips,
+ stable_ssh_ports=prev_ssh_ports,
+ cluster_info=prev_cluster_info,
+ )
+ usage_lib.messages.usage.update_final_cluster_status(
+ status_lib.ClusterStatus.INIT)
- # This sets the status to INIT (even for a normal, UP cluster).
- global_user_state.add_or_update_cluster(
- cluster_name,
- cluster_handle=handle,
- requested_resources=requested_resources,
- ready=False,
- is_managed=self._is_managed,
- provision_log_path=log_abs_path,
- )
+ # This sets the status to INIT (even for a normal, UP cluster).
+ global_user_state.add_or_update_cluster(
+ cluster_name,
+ cluster_handle=handle,
+ requested_resources=requested_resources,
+ ready=False,
+ is_managed=self._is_managed,
+ provision_log_path=log_abs_path,
+ )
- # Add cluster event for actual provisioning start.
- global_user_state.add_cluster_event(
- cluster_name, status_lib.ClusterStatus.INIT,
- f'Provisioning on {to_provision.cloud.display_name()} ' +
- f'in {to_provision.region}',
- global_user_state.ClusterEventType.STATUS_CHANGE)
+ # Add cluster event for actual provisioning start.
+ global_user_state.add_cluster_event(
+ cluster_name, status_lib.ClusterStatus.INIT,
+ f'Provisioning on {to_provision.cloud.display_name()} ' +
+ f'in {to_provision.region}',
+ global_user_state.ClusterEventType.STATUS_CHANGE)
- global_user_state.set_owner_identity_for_cluster(
- cluster_name, cloud_user_identity)
-
- if (to_provision.cloud.PROVISIONER_VERSION ==
- clouds.ProvisionerVersion.SKYPILOT):
- # TODO (suquark): Gradually move the other clouds to
- # the new provisioner once they are ready.
- assert to_provision.region == region.name, (to_provision,
- region)
- num_nodes = handle.launched_nodes
- # Some clouds, like RunPod, only support exposing ports during
- # launch. For those clouds, we pass the ports to open in the
- # `bulk_provision` to expose the ports during provisioning.
- # If the `bulk_provision` is to apply on an existing cluster,
- # it should be ignored by the underlying provisioner impl
- # as it will only apply to newly-created instances.
- ports_to_open_on_launch = (
- list(resources_utils.port_ranges_to_set(to_provision.ports))
- if to_provision.cloud.OPEN_PORTS_VERSION <=
- clouds.OpenPortsVersion.LAUNCH_ONLY else None)
- try:
- controller = controller_utils.Controllers.from_name(
- cluster_name)
- controller_str = ('' if controller is None else
- f' {controller.value.name}')
- if isinstance(to_provision.cloud, clouds.Kubernetes):
- suffix = '.'
- if region.name.startswith('ssh-'):
- ssh_node_pool_name = common_utils.removeprefix(
- region.name, 'ssh-')
- suffix = f' ({ssh_node_pool_name})'
- logger.info(
- ux_utils.starting_message(
- f'Launching{controller_str} on '
- f'{to_provision.cloud}{suffix}'))
- else:
- logger.info(
- ux_utils.starting_message(
- f'Launching{controller_str} on '
- f'{to_provision.cloud} '
- f'{region.name}{colorama.Style.RESET_ALL}'
- f'{zone_str}.'))
- assert handle.cluster_yaml is not None
- provision_record = provisioner.bulk_provision(
- to_provision.cloud,
- region,
- zones,
- resources_utils.ClusterName(
- cluster_name, handle.cluster_name_on_cloud),
- num_nodes=num_nodes,
- cluster_yaml=handle.cluster_yaml,
- prev_cluster_ever_up=prev_cluster_ever_up,
- log_dir=self.log_dir,
- ports_to_open_on_launch=ports_to_open_on_launch)
- # NOTE: We will handle the logic of '_ensure_cluster_ray_started' #pylint: disable=line-too-long
- # in 'provision_utils.post_provision_runtime_setup()' in the
- # caller.
- resources_vars = (
- to_provision.cloud.make_deploy_resources_variables(
- to_provision,
+ global_user_state.set_owner_identity_for_cluster(
+ cluster_name, cloud_user_identity)
+
+ if (to_provision.cloud.PROVISIONER_VERSION ==
+ clouds.ProvisionerVersion.SKYPILOT):
+ # TODO (suquark): Gradually move the other clouds to
+ # the new provisioner once they are ready.
+ assert to_provision.region == region.name, (to_provision,
+ region)
+ num_nodes = handle.launched_nodes
+ # Some clouds, like RunPod, only support exposing ports during
+ # launch. For those clouds, we pass the ports to open in the
+ # `bulk_provision` to expose the ports during provisioning.
+ # If the `bulk_provision` is to apply on an existing cluster,
+ # it should be ignored by the underlying provisioner impl
+ # as it will only apply to newly-created instances.
+ ports_to_open_on_launch = (
+ list(
+ resources_utils.port_ranges_to_set(
+ to_provision.ports))
+ if to_provision.cloud.OPEN_PORTS_VERSION <=
+ clouds.OpenPortsVersion.LAUNCH_ONLY else None)
+ try:
+ controller = controller_utils.Controllers.from_name(
+ cluster_name)
+ controller_str = ('' if controller is None else
+ f' {controller.value.name}')
+ if isinstance(to_provision.cloud, clouds.Kubernetes):
+ suffix = '.'
+ if region.name.startswith('ssh-'):
+ ssh_node_pool_name = common_utils.removeprefix(
+ region.name, 'ssh-')
+ suffix = f' ({ssh_node_pool_name})'
+ logger.info(
+ ux_utils.starting_message(
+ f'Launching{controller_str} on '
+ f'{to_provision.cloud}{suffix}'))
+ else:
+ logger.info(
+ ux_utils.starting_message(
+ f'Launching{controller_str} on '
+ f'{to_provision.cloud} '
+ f'{region.name}{colorama.Style.RESET_ALL}'
+ f'{zone_str}.'))
+ assert handle.cluster_yaml is not None
+ provision_record = provisioner.bulk_provision(
+ to_provision.cloud,
+ region,
+ zones,
resources_utils.ClusterName(
cluster_name, handle.cluster_name_on_cloud),
- region, zones, num_nodes))
- config_dict['provision_record'] = provision_record
- config_dict['resources_vars'] = resources_vars
- config_dict['handle'] = handle
- return config_dict
- except provision_common.StopFailoverError:
- with ux_utils.print_exception_no_traceback():
- raise
- except exceptions.InconsistentHighAvailabilityError:
- # No teardown happens for this error.
- with ux_utils.print_exception_no_traceback():
- raise
- except config_lib.KubernetesError as e:
- if e.insufficent_resources:
- insufficient_resources = e.insufficent_resources
- # NOTE: We try to cleanup the cluster even if the previous
- # cluster does not exist. Also we are fast at
- # cleaning up clusters now if there is no existing node.
- CloudVmRayBackend().post_teardown_cleanup(
- handle,
- terminate=not prev_cluster_ever_up,
- remove_from_db=False,
- failover=True,
- )
- # TODO(suquark): other clouds may have different zone
- # blocking strategy. See '_update_blocklist_on_error'
- # for details.
- FailoverCloudErrorHandlerV2.update_blocklist_on_error(
- self._blocked_resources, to_provision, region, zones, e)
- continue
- except Exception as e: # pylint: disable=broad-except
- # NOTE: We try to cleanup the cluster even if the previous
- # cluster does not exist. Also we are fast at
- # cleaning up clusters now if there is no existing node..
- CloudVmRayBackend().post_teardown_cleanup(
- handle,
- terminate=not prev_cluster_ever_up,
- remove_from_db=False,
- failover=True)
- # TODO(suquark): other clouds may have different zone
- # blocking strategy. See '_update_blocklist_on_error'
- # for details.
- FailoverCloudErrorHandlerV2.update_blocklist_on_error(
- self._blocked_resources, to_provision, region, zones, e)
- continue
- # NOTE: The code below in the loop should not be reachable
- # with the new provisioner.
+ num_nodes=num_nodes,
+ cluster_yaml=handle.cluster_yaml,
+ prev_cluster_ever_up=prev_cluster_ever_up,
+ log_dir=self.log_dir,
+ ports_to_open_on_launch=ports_to_open_on_launch)
+ # NOTE: We will handle the logic of '_ensure_cluster_ray_started'
+ # in 'provision_utils.post_provision_runtime_setup()' in the
+ # caller.
+ resources_vars = (
+ to_provision.cloud.make_deploy_resources_variables(
+ to_provision,
+ resources_utils.ClusterName(
+ cluster_name, handle.cluster_name_on_cloud),
+ region, zones, num_nodes))
+ config_dict['provision_record'] = provision_record
+ config_dict['resources_vars'] = resources_vars
+ config_dict['handle'] = handle
+ return config_dict
+ except provision_common.StopFailoverError:
+ with ux_utils.print_exception_no_traceback():
+ raise
+ except exceptions.InconsistentHighAvailabilityError:
+ # No teardown happens for this error.
+ with ux_utils.print_exception_no_traceback():
+ raise
+ except config_lib.KubernetesError as e:
+ if e.insufficent_resources:
+ insufficient_resources = e.insufficent_resources
+                        # NOTE: We try to clean up the cluster even if the
+                        # previous cluster does not exist. Cleaning up is also
+                        # fast now when there is no existing node.
+ CloudVmRayBackend().post_teardown_cleanup(
+ handle,
+ terminate=not prev_cluster_ever_up,
+ remove_from_db=False,
+ failover=True,
+ )
+ # TODO(suquark): other clouds may have different zone
+ # blocking strategy. See '_update_blocklist_on_error'
+ # for details.
+ FailoverCloudErrorHandlerV2.update_blocklist_on_error(
+ self._blocked_resources, to_provision, region,
+ zones, e)
+ continue
+ except Exception as e: # pylint: disable=broad-except
+                        # NOTE: We try to clean up the cluster even if the
+                        # previous cluster does not exist. Cleaning up is also
+                        # fast now when there is no existing node.
+ CloudVmRayBackend().post_teardown_cleanup(
+ handle,
+ terminate=not prev_cluster_ever_up,
+ remove_from_db=False,
+ failover=True)
+ # TODO(suquark): other clouds may have different zone
+ # blocking strategy. See '_update_blocklist_on_error'
+ # for details.
+ FailoverCloudErrorHandlerV2.update_blocklist_on_error(
+ self._blocked_resources, to_provision, region,
+ zones, e)
+ continue
+ # NOTE: The code below in the loop should not be reachable
+ # with the new provisioner.
- logging_info = {
- 'cluster_name': cluster_name,
- 'region_name': region.name,
- 'zone_str': zone_str,
- }
+ logging_info = {
+ 'cluster_name': cluster_name,
+ 'region_name': region.name,
+ 'zone_str': zone_str,
+ }
- status, stdout, stderr, head_internal_ip, head_external_ip = (
- self._gang_schedule_ray_up(to_provision.cloud,
- cluster_config_file, handle,
- log_abs_path, stream_logs,
- logging_info, to_provision.use_spot))
+ status, stdout, stderr, head_internal_ip, head_external_ip = (
+ self._gang_schedule_ray_up(to_provision.cloud,
+ cluster_config_file, handle,
+ log_abs_path, stream_logs,
+ logging_info,
+ to_provision.use_spot))
+
+ if status == GangSchedulingStatus.CLUSTER_READY:
+                # We must query the IPs from the cloud provider once the
+                # provisioning is done, to make sure the cluster IPs are
+                # up-to-date.
+                # Stale IPs may be caused by the node being restarted
+                # manually or by the cloud provider.
+ # Optimize the case where the cluster's head IPs can be parsed
+ # from the output of 'ray up'.
+ if handle.launched_nodes == 1:
+ handle.update_cluster_ips(
+ max_attempts=_FETCH_IP_MAX_ATTEMPTS,
+ internal_ips=[head_internal_ip],
+ external_ips=[head_external_ip])
+ else:
+ handle.update_cluster_ips(
+ max_attempts=_FETCH_IP_MAX_ATTEMPTS)
+ handle.update_ssh_ports(max_attempts=_FETCH_IP_MAX_ATTEMPTS)
+ if cluster_exists:
+ # Guard against the case where there's an existing cluster
+ # with ray runtime messed up (e.g., manually killed) by (1)
+ # querying ray status (2) restarting ray if needed.
+ #
+ # The above 'ray up' will not restart it automatically due
+                    # to the 'ray up --no-restart' flag.
+ #
+ # NOTE: this is performance sensitive and has been observed
+ # to take 9s. Only do this for existing clusters, not
+ # freshly launched ones (which should have ray runtime
+ # started).
+ self._ensure_cluster_ray_started(handle, log_abs_path)
- if status == GangSchedulingStatus.CLUSTER_READY:
- # We must query the IPs from the cloud provider, when the
- # provisioning is done, to make sure the cluster IPs are
- # up-to-date.
- # The staled IPs may be caused by the node being restarted
- # manually or by the cloud provider.
- # Optimize the case where the cluster's head IPs can be parsed
- # from the output of 'ray up'.
- if handle.launched_nodes == 1:
- handle.update_cluster_ips(
- max_attempts=_FETCH_IP_MAX_ATTEMPTS,
- internal_ips=[head_internal_ip],
- external_ips=[head_external_ip])
+ config_dict['handle'] = handle
+ logger.info(
+ ux_utils.finishing_message(
+ f'Cluster launched: {cluster_name!r}.',
+ log_path,
+ cluster_name=cluster_name))
+ return config_dict
+
+ # The cluster is not ready. We must perform error recording and/or
+ # cleanup.
+
+ # If cluster was ever up, stop it; otherwise terminate.
+ terminate_or_stop = not prev_cluster_ever_up
+ definitely_no_nodes_launched = False
+ if status == GangSchedulingStatus.HEAD_FAILED:
+ # ray up failed for the head node.
+ definitely_no_nodes_launched = (
+ FailoverCloudErrorHandlerV1.update_blocklist_on_error(
+ self._blocked_resources, to_provision, region,
+ zones, stdout, stderr))
else:
- handle.update_cluster_ips(
- max_attempts=_FETCH_IP_MAX_ATTEMPTS)
- handle.update_ssh_ports(max_attempts=_FETCH_IP_MAX_ATTEMPTS)
- if cluster_exists:
- # Guard against the case where there's an existing cluster
- # with ray runtime messed up (e.g., manually killed) by (1)
- # querying ray status (2) restarting ray if needed.
- #
- # The above 'ray up' will not restart it automatically due
- # to 'ray up # --no-restart' flag.
- #
- # NOTE: this is performance sensitive and has been observed
- # to take 9s. Only do this for existing clusters, not
- # freshly launched ones (which should have ray runtime
- # started).
- self._ensure_cluster_ray_started(handle, log_abs_path)
-
- config_dict['handle'] = handle
- logger.info(
- ux_utils.finishing_message(
- f'Cluster launched: {cluster_name!r}.',
- log_path,
- cluster_name=cluster_name))
- return config_dict
-
- # The cluster is not ready. We must perform error recording and/or
- # cleanup.
-
- # If cluster was ever up, stop it; otherwise terminate.
- terminate_or_stop = not prev_cluster_ever_up
- definitely_no_nodes_launched = False
- if status == GangSchedulingStatus.HEAD_FAILED:
- # ray up failed for the head node.
- definitely_no_nodes_launched = (
- FailoverCloudErrorHandlerV1.update_blocklist_on_error(
- self._blocked_resources, to_provision, region, zones,
- stdout, stderr))
- else:
- # gang scheduling failed.
- assert status == GangSchedulingStatus.GANG_FAILED, status
- # The stdout/stderr of ray up is not useful here, since
- # head node is successfully provisioned.
- definitely_no_nodes_launched = (
- FailoverCloudErrorHandlerV1.update_blocklist_on_error(
- self._blocked_resources,
- to_provision,
- region,
- zones=zones,
- stdout=None,
- stderr=None))
- # GANG_FAILED means head is up, workers failed.
- assert definitely_no_nodes_launched is False, (
- definitely_no_nodes_launched)
-
- # Only log the errors for GANG_FAILED, since HEAD_FAILED may
- # not have created any resources (it can happen however) and
- # HEAD_FAILED can happen in "normal" failover cases.
- logger.error('*** Failed provisioning the cluster. ***')
- terminate_str = ('Terminating'
- if terminate_or_stop else 'Stopping')
- logger.error(f'*** {terminate_str} the failed cluster. ***')
-
- # If these conditions hold, it *should* be safe to skip the cleanup
- # action. This is a UX optimization.
- #
- # We want to skip mainly for VPC/subnets errors thrown during node
- # provider bootstrapping: if users encountered "No VPC with name
-            # 'xxx' is found in <region>.", then going ahead to down the
- # non-existent cluster will itself print out a (caught, harmless)
- # error with the same message. This was found to be
- # confusing. Thus we skip termination.
- skip_cleanup = not cluster_exists and definitely_no_nodes_launched
- if skip_cleanup:
- continue
+ # gang scheduling failed.
+ assert status == GangSchedulingStatus.GANG_FAILED, status
+ # The stdout/stderr of ray up is not useful here, since
+ # head node is successfully provisioned.
+ definitely_no_nodes_launched = (
+ FailoverCloudErrorHandlerV1.update_blocklist_on_error(
+ self._blocked_resources,
+ to_provision,
+ region,
+ zones=zones,
+ stdout=None,
+ stderr=None))
+ # GANG_FAILED means head is up, workers failed.
+ assert definitely_no_nodes_launched is False, (
+ definitely_no_nodes_launched)
+
+            # Only log the errors for GANG_FAILED, since HEAD_FAILED may not
+            # have created any resources (though it sometimes does) and can
+            # happen in "normal" failover cases.
+ logger.error('*** Failed provisioning the cluster. ***')
+ terminate_str = ('Terminating'
+ if terminate_or_stop else 'Stopping')
+ logger.error(f'*** {terminate_str} the failed cluster. ***')
+
+ # If these conditions hold, it *should* be safe to skip the cleanup
+ # action. This is a UX optimization.
+ #
+ # We want to skip mainly for VPC/subnets errors thrown during node
+ # provider bootstrapping: if users encountered "No VPC with name
+            # 'xxx' is found in <region>.", then going ahead to down the
+ # non-existent cluster will itself print out a (caught, harmless)
+ # error with the same message. This was found to be
+ # confusing. Thus we skip termination.
+ skip_cleanup = not cluster_exists and definitely_no_nodes_launched
+ if skip_cleanup:
+ continue
- # There may exist partial nodes (e.g., head node) so we must
- # terminate or stop before moving on to other regions.
- #
- # NOTE: even HEAD_FAILED could've left a live head node there,
- # so we must terminate/stop here too. E.g., node is up, and ray
- # autoscaler proceeds to setup commands, which may fail:
- # ERR updater.py:138 -- New status: update-failed
- CloudVmRayBackend().teardown_no_lock(handle,
- terminate=terminate_or_stop,
- remove_from_db=False)
+ # There may exist partial nodes (e.g., head node) so we must
+ # terminate or stop before moving on to other regions.
+ #
+ # NOTE: even HEAD_FAILED could've left a live head node there,
+ # so we must terminate/stop here too. E.g., node is up, and ray
+ # autoscaler proceeds to setup commands, which may fail:
+ # ERR updater.py:138 -- New status: update-failed
+ CloudVmRayBackend().teardown_no_lock(
+ handle, terminate=terminate_or_stop, remove_from_db=False)
message = self._insufficient_resources_msg(to_provision,
requested_resources,
@@ -2673,6 +2690,13 @@ def add_job(
) -> 'jobsv1_pb2.AddJobResponse':
return self._jobs_stub.AddJob(request, timeout=timeout)
+ def set_job_info_without_job_id(
+ self,
+ request: 'jobsv1_pb2.SetJobInfoWithoutJobIdRequest',
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+ ) -> 'jobsv1_pb2.SetJobInfoWithoutJobIdResponse':
+ return self._jobs_stub.SetJobInfoWithoutJobId(request, timeout=timeout)
+
def queue_job(
self,
request: 'jobsv1_pb2.QueueJobRequest',
@@ -3072,6 +3096,18 @@ def _maybe_clear_external_cluster_failures(
f'{cluster_name!r}: {", ".join(failure_details)}'
f'{colorama.Style.RESET_ALL}')
+ def check_skylet_running(self, handle: CloudVmRayResourceHandle):
+        # For backward compatibility and robustness, skylet is checked and
+        # restarted if necessary.
+ logger.debug('Checking if skylet is running on the head node.')
+ with rich_utils.safe_status(
+ ux_utils.spinner_message('Preparing SkyPilot runtime')):
+ # We need to source bashrc for skylet to make sure the autostop
+ # event can access the path to the cloud CLIs.
+ self.run_on_head(handle,
+ instance_setup.MAYBE_SKYLET_RESTART_CMD,
+ source_bashrc=True)
+
def _locked_provision(
self,
lock_id: str,
@@ -3326,14 +3362,7 @@ def _get_zone(runner):
# For backward compatibility and robustness of skylet, it is checked
# and restarted if necessary.
- logger.debug('Checking if skylet is running on the head node.')
- with rich_utils.safe_status(
- ux_utils.spinner_message('Preparing SkyPilot runtime')):
- # We need to source bashrc for skylet to make sure the autostop
- # event can access the path to the cloud CLIs.
- self.run_on_head(handle,
- instance_setup.MAYBE_SKYLET_RESTART_CMD,
- source_bashrc=True)
+ self.check_skylet_running(handle)
self._update_after_cluster_provisioned(
handle, to_provision_config.prev_handle, task,
@@ -3800,10 +3829,10 @@ def _dump_code_to_file(codegen: str,
# We choose to sync code + exec, because the alternative of
# 'ray submit' may not work as it may use system python
# (python2) to execute the script. Happens for AWS.
- head_runner.rsync(source=fp.name,
- target=script_path,
- up=True,
- stream_logs=False)
+ head_runner.rsync_driver(source=fp.name,
+ target=script_path,
+ up=True,
+ stream_logs=False)
mkdir_code = f'mkdir -p {remote_log_dir} && touch {remote_log_path}'
encoded_script = shlex.quote(codegen)
@@ -3852,20 +3881,33 @@ def _dump_code_to_file(codegen: str,
for task_id, task in enumerate(managed_job_dag.tasks):
resources_str = backend_utils.get_task_resources_str(
task, is_managed_job=True)
- managed_job_tasks.append(
- jobsv1_pb2.ManagedJobTask(
- task_id=task_id,
- name=task.name,
- resources_str=resources_str,
- metadata_json=task.metadata_json))
-
+ managed_job_task = jobsv1_pb2.ManagedJobTask(
+ task_id=task_id,
+ name=task.name,
+ resources_str=resources_str,
+ metadata_json=task.metadata_json)
+ # Only set is_primary_in_job_group for job groups
+ if managed_job_dag.is_job_group():
+                        # If primary_tasks is None, all tasks are
+                        # primary.
+ managed_job_task.is_primary_in_job_group = (
+ managed_job_dag.primary_tasks is None or
+ task.name in managed_job_dag.primary_tasks)
+ managed_job_tasks.append(managed_job_task)
+
+ # Execution mode: 'parallel' for job groups, 'serial' for
+ # pipelines and single jobs
+ execution = (managed_job_dag.execution.value
+ if managed_job_dag.execution else
+ DEFAULT_EXECUTION.value)
managed_job_info = jobsv1_pb2.ManagedJobInfo(
name=managed_job_dag.name,
pool=managed_job_dag.pool,
workspace=workspace,
entrypoint=entrypoint,
tasks=managed_job_tasks,
- user_id=managed_job_user_id)
+ user_id=managed_job_user_id,
+ execution=execution)
if backend_utils.is_command_length_over_limit(codegen):
_dump_code_to_file(codegen)
@@ -3893,29 +3935,6 @@ def _dump_code_to_file(codegen: str,
_dump_code_to_file(codegen)
job_submit_cmd = f'{mkdir_code} && {code}'
- def _maybe_add_managed_job_code(job_submit_cmd: str) -> str:
- if managed_job_dag is not None:
- # Add the managed job to job queue database.
- managed_job_codegen = managed_jobs.ManagedJobCodeGen()
- managed_job_code = managed_job_codegen.set_pending(
- job_id,
- managed_job_dag,
- skypilot_config.get_active_workspace(
- force_user_workspace=True),
- entrypoint=common_utils.get_current_command(),
- user_hash=managed_job_user_id)
- # Set the managed job to PENDING state to make sure that
- # this managed job appears in the `sky jobs queue`, even
- # if it needs to wait to be submitted.
- # We cannot set the managed job to PENDING state in the
- # job template (jobs-controller.yaml.j2), as it may need
- # to wait for the run commands to be scheduled on the job
- # controller in high-load cases.
- job_submit_cmd += ' && ' + managed_job_code
- return job_submit_cmd
-
- job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
-
# For Slurm, run in background so that SSH returns immediately.
# This is needed because we add the wait_for_job code above which
# makes the command block until the job completes.
@@ -3940,7 +3959,6 @@ def _maybe_add_managed_job_code(job_submit_cmd: str) -> str:
f'Output: {output}')
_dump_code_to_file(codegen)
job_submit_cmd = f'{mkdir_code} && {code}'
- job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
# See comment above for why run_in_background=is_slurm.
returncode, stdout, stderr = self.run_on_head(
handle,
@@ -4026,6 +4044,97 @@ def _add_job(self, handle: CloudVmRayResourceHandle,
f'Returncode: {returncode}') from e
return job_id, log_dir
+ def set_job_info_without_job_id(
+ self,
+ handle: CloudVmRayResourceHandle,
+ name: str,
+ workspace: str,
+ entrypoint: str,
+ pool: Optional[str],
+ pool_hash: Optional[str],
+ user_hash: Optional[str],
+ task_ids: List[int],
+ task_names: List[str],
+ resources_str: str,
+ metadata_jsons: List[str],
+ is_primary_in_job_groups: List[bool],
+ num_jobs: int = 1,
+ execution: str = DEFAULT_EXECUTION.value) -> List[int]:
+ """Set job info without creating entries in the jobs table.
+
+        This creates entries in job_info_table and spot_table only, which
+        prevents autostop from being blocked by jobs stuck in INIT status.
+ """
+ use_legacy = not handle.is_grpc_enabled_with_flag
+
+ if not use_legacy:
+ try:
+ request = jobsv1_pb2.SetJobInfoWithoutJobIdRequest(
+ name=name,
+ workspace=workspace,
+ entrypoint=entrypoint,
+ pool=pool,
+ pool_hash=pool_hash,
+ user_hash=user_hash,
+ task_ids=task_ids,
+ task_names=task_names,
+ resources_str=resources_str,
+ metadata_jsons=metadata_jsons,
+ num_jobs=num_jobs,
+ execution=execution,
+ is_primary_in_job_groups=is_primary_in_job_groups)
+ response = backend_utils.invoke_skylet_with_retries(
+ lambda: SkyletClient(handle.get_grpc_channel()
+ ).set_job_info_without_job_id(request))
+ return list(response.job_ids)
+ except exceptions.SkyletMethodNotImplementedError:
+ use_legacy = True
+
+ if use_legacy:
+ code = job_lib.JobLibCodeGen.set_job_info_without_job_id(
+ name=name,
+ workspace=workspace,
+ entrypoint=entrypoint,
+ pool=pool,
+ pool_hash=pool_hash,
+ user_hash=user_hash,
+ task_ids=task_ids,
+ task_names=task_names,
+ resources_str=resources_str,
+ metadata_jsons=metadata_jsons,
+ is_primary_in_job_groups=is_primary_in_job_groups,
+ num_jobs=num_jobs,
+ execution=execution)
+ returncode, result_str, stderr = self.run_on_head(
+ handle,
+ code,
+ stream_logs=False,
+ require_outputs=True,
+ separate_stderr=True)
+ backend_utils.check_stale_runtime_on_remote(returncode, stderr,
+ handle.cluster_name)
+ subprocess_utils.handle_returncode(returncode, code,
+ 'Failed to fetch job id.',
+ stderr)
+ try:
+ # Parse job IDs from output
+ job_ids_match = _JOB_IDS_PATTERN.search(result_str)
+ if job_ids_match:
+ job_ids = [
+ int(x.strip())
+ for x in job_ids_match.group(1).split(',')
+ ]
+ return job_ids
+ else:
+ raise ValueError(
+ f'Failed to parse job ids from: {result_str}')
+ except ValueError as e:
+ logger.error(stderr)
+ raise ValueError(f'Failed to parse job id: {result_str}; '
+ f'Returncode: {returncode}') from e
+ return []
+
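
The method above follows the backend's gRPC-first, legacy-fallback shape: try the skylet RPC and, if the remote skylet predates it, fall back to generated code run over SSH. A minimal sketch of the pattern; `call_with_fallback` and its arguments are illustrative names, not part of this diff:

    from typing import Callable, TypeVar

    from sky import exceptions

    T = TypeVar('T')


    def call_with_fallback(grpc_call: Callable[[], T],
                           legacy_call: Callable[[], T],
                           grpc_enabled: bool) -> T:
        """Prefer the gRPC path; fall back to codegen for older skylets."""
        if grpc_enabled:
            try:
                return grpc_call()
            except exceptions.SkyletMethodNotImplementedError:
                # Remote skylet does not implement this RPC yet; fall through.
                pass
        return legacy_call()
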
def _execute(
self,
handle: CloudVmRayResourceHandle,
@@ -4356,7 +4465,7 @@ def _rsync_down(args) -> None:
(runner, local_log_dir, remote_log_dir) = args
try:
os.makedirs(os.path.expanduser(local_log_dir), exist_ok=True)
- runner.rsync(
+ runner.rsync_driver(
# Require a `/` at the end to make sure the parent dir
# are not created locally. We do not add additional '*' as
# kubernetes's rsync does not work with an ending '*'.
@@ -4457,18 +4566,67 @@ def tail_logs(
final = e.code
return final
+ def tail_autostop_logs(self,
+ handle: CloudVmRayResourceHandle,
+ follow: bool = True,
+ tail: int = 0) -> int:
+ """Tail the autostop hook logs.
+
+ Args:
+ handle: The handle to the cluster.
+ follow: Whether to follow the logs.
+ tail: The number of lines to display from the end of the
+ log file. If 0, print all lines.
+
+ Returns:
+ The exit code of the tail command.
+ """
+ # Construct tail command for the autostop hook log
+ log_path = f'~/{constants.AUTOSTOP_HOOK_LOG_FILE}'
+ tail_cmd_parts = ['tail']
+ if tail > 0:
+ tail_cmd_parts.extend(['-n', str(tail)])
+ if follow:
+ tail_cmd_parts.append('-f')
+ tail_cmd_parts.append(log_path)
+
+ # Add fallback to show helpful message if file doesn't exist
+ tail_cmd = ' '.join(tail_cmd_parts)
+ error_msg = (f'Autostop hook log file not found at {log_path}. '
+ f'The autostop hook may not have been executed yet.')
+ cmd = (f'if [ -f {log_path} ]; then {tail_cmd}; '
+ f'else echo "{error_msg}"; exit 1; fi')
+
+ # With the stdin=subprocess.DEVNULL, the ctrl-c will not directly
+ # kill the process, so we need to handle it manually here.
+ if threading.current_thread() is threading.main_thread():
+ signal.signal(signal.SIGINT, backend_utils.interrupt_handler)
+ signal.signal(signal.SIGTSTP, backend_utils.stop_handler)
+ try:
+ returncode = self.run_on_head(
+ handle,
+ cmd,
+ stream_logs=True,
+ # Allocate a pseudo-terminal to disable output buffering.
+ ssh_mode=command_runner.SshMode.INTERACTIVE,
+ )
+ except SystemExit as e:
+ returncode = e.code
+ return returncode
+
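
For illustration, with `tail=100` and `follow=True` the remote command assembled above has the shape below; the placeholder value of `log_path` is an assumption, as the real path comes from `constants.AUTOSTOP_HOOK_LOG_FILE`:

    log_path = '~/<AUTOSTOP_HOOK_LOG_FILE>'  # real value taken from constants
    tail_cmd = f'tail -n 100 -f {log_path}'
    cmd = (f'if [ -f {log_path} ]; then {tail_cmd}; '
           f'else echo "Autostop hook log file not found at {log_path}. '
           f'The autostop hook may not have been executed yet."; exit 1; fi')
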
def tail_managed_job_logs(self,
handle: CloudVmRayResourceHandle,
job_id: Optional[int] = None,
job_name: Optional[str] = None,
controller: bool = False,
follow: bool = True,
- tail: Optional[int] = None) -> int:
+ tail: Optional[int] = None,
+ task: Optional[Union[str, int]] = None) -> int:
# if job_name is not None, job_id should be None
assert job_name is None or job_id is None, (job_name, job_id)
# TODO(kevin): Migrate stream_logs to gRPC
code = managed_jobs.ManagedJobCodeGen.stream_logs(
- job_name, job_id, follow, controller, tail)
+ job_name, job_id, follow, controller, tail, task)
# With the stdin=subprocess.DEVNULL, the ctrl-c will not directly
# kill the process, so we need to handle it manually here.
@@ -4925,7 +5083,7 @@ def teardown_no_lock(self,
# configurations (such as VPC not found). So it's safe & good UX
# to not print a failure message.
elif ('TPU must be specified.' not in stderr and
- 'SKYPILOT_ERROR_NO_NODES_LAUNCHED: ' not in stderr):
+ provision_constants.ERROR_NO_NODES_LAUNCHED not in stderr):
raise RuntimeError(
_TEARDOWN_FAILURE_MESSAGE.format(
extra_reason='',
@@ -5039,6 +5197,21 @@ def post_teardown_cleanup(self,
else:
raise
+ # Clean up all cluster resources (e.g., Kubernetes services).
+ # This is a no-op for most clouds, but Kubernetes needs it to
+ # clean up orphaned services when pods are deleted externally.
+ try:
+ provision_lib.cleanup_cluster_resources(
+ repr(cloud), cluster_name_on_cloud, config['provider'])
+ except Exception as e: # pylint: disable=broad-except
+ if purge:
+ msg = common_utils.format_exception(e, use_bracket=True)
+ logger.warning(
+ f'Failed to cleanup cluster resources. Skipping '
+ f'since purge is set. Details: {msg}')
+ else:
+ raise
+
if ports_cleaned_up and custom_multi_network_cleaned_up:
try:
self.remove_cluster_config(handle)
@@ -5144,7 +5317,9 @@ def set_autostop(self,
idle_minutes_to_autostop: Optional[int],
wait_for: Optional[autostop_lib.AutostopWaitFor],
down: bool = False,
- stream_logs: bool = True) -> None:
+ stream_logs: bool = True,
+ hook: Optional[str] = None,
+ hook_timeout: Optional[int] = None) -> None:
# The core.autostop() function should have already checked that the
# cloud and resources support requested autostop.
if idle_minutes_to_autostop is not None:
@@ -5197,11 +5372,17 @@ def set_autostop(self,
autostopv1_pb2.AUTOSTOP_WAIT_FOR_UNSPECIFIED,
down=down,
)
+ if hook:
+ request.hook = hook
+ if hook_timeout is not None:
+ request.hook_timeout = hook_timeout
+
backend_utils.invoke_skylet_with_retries(lambda: SkyletClient(
handle.get_grpc_channel()).set_autostop(request))
else:
code = autostop_lib.AutostopCodeGen.set_autostop(
- idle_minutes_to_autostop, self.NAME, wait_for, down)
+ idle_minutes_to_autostop, self.NAME, wait_for, down, hook,
+ hook_timeout)
returncode, _, stderr = self.run_on_head(
handle, code, require_outputs=True, stream_logs=stream_logs)
subprocess_utils.handle_returncode(returncode,
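
A hypothetical call-site sketch for the new `hook`/`hook_timeout` parameters above; the handle and argument values are assumptions for illustration, not taken from this diff:

    # Autostop after 30 idle minutes; first run a user-supplied hook,
    # giving it at most 600 seconds to finish.
    backend.set_autostop(handle,
                         idle_minutes_to_autostop=30,
                         wait_for=None,
                         down=False,
                         hook='bash ~/on_autostop.sh',
                         hook_timeout=600)
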
@@ -5233,13 +5414,16 @@ def is_definitely_autostopping(self,
# The head node of the cluster is not UP or in an abnormal state.
# We cannot check if the cluster is autostopping.
return False
+
+ is_autostopping = False
+
if handle.is_grpc_enabled_with_flag:
try:
request = autostopv1_pb2.IsAutostoppingRequest()
response = backend_utils.invoke_skylet_with_retries(
lambda: SkyletClient(handle.get_grpc_channel()
).is_autostopping(request))
- return response.is_autostopping
+ is_autostopping = response.is_autostopping
except Exception as e: # pylint: disable=broad-except
# The cluster may have been terminated, causing the gRPC call
# to timeout and fail.
@@ -5250,11 +5434,14 @@ def is_definitely_autostopping(self,
returncode, stdout, stderr = self.run_on_head(
handle, code, require_outputs=True, stream_logs=stream_logs)
if returncode == 0:
- return message_utils.decode_payload(stdout)
- logger.debug('Failed to check if cluster is autostopping with '
- f'{returncode}: {stdout+stderr}\n'
- f'Command: {code}')
- return False
+ is_autostopping = message_utils.decode_payload(stdout)
+ else:
+ logger.debug('Failed to check if cluster is autostopping with '
+ f'{returncode}: {stdout+stderr}\n'
+ f'Command: {code}')
+ return False
+
+ return is_autostopping
# TODO(zhwu): Refactor this to a CommandRunner class, so different backends
# can support its own command runner.
@@ -5320,7 +5507,7 @@ def run_on_head(
if under_remote_workdir:
cmd = f'cd {SKY_REMOTE_WORKDIR} && {cmd}'
- return head_runner.run(
+ return head_runner.run_driver(
cmd,
port_forward=port_forward,
log_path=log_path,
@@ -5966,7 +6153,8 @@ def _skypilot_predefined_env_vars(
'cloud': str(handle.launched_resources.cloud),
'region': handle.launched_resources.region,
'zone': handle.launched_resources.zone,
- })
+ }),
+ constants.USER_ENV_VAR: common_utils.get_current_user_name(),
}
def _get_task_env_vars(self, task: task_lib.Task, job_id: int,
@@ -6003,7 +6191,16 @@ def _get_task_codegen_class(
slurm_job_id = head_instance.tags.get('job_id')
assert (slurm_job_id
is not None), ('job_id tag not found in head instance')
- return task_codegen.SlurmCodeGen(slurm_job_id=slurm_job_id)
+ container_image = handle.launched_resources.extract_docker_image()
+ container_name = None
+ if container_image is not None:
+ container_name = slurm_utils.pyxis_container_name(
+ handle.cluster_name_on_cloud)
+
+ return task_codegen.SlurmCodeGen(
+ slurm_job_id,
+ container_name,
+ )
else:
return task_codegen.RayCodeGen()
diff --git a/sky/backends/docker_utils.py b/sky/backends/docker_utils.py
index 1da4fdbf873..4198568aa48 100644
--- a/sky/backends/docker_utils.py
+++ b/sky/backends/docker_utils.py
@@ -201,7 +201,7 @@ def push_dockerimage(local_tag, remote_name):
def make_bash_from_multiline(codegen: str) -> str:
"""Makes a bash script from a multi-line string of commands.
- Automatically includes conda setup prefixes.
+ Automatically includes conda setup prefixes if conda is installed.
Args:
codegen: str: multiline commands to be converted to a shell script
diff --git a/sky/backends/task_codegen.py b/sky/backends/task_codegen.py
index e188dc9bc17..01b2d1b2ab5 100644
--- a/sky/backends/task_codegen.py
+++ b/sky/backends/task_codegen.py
@@ -5,6 +5,7 @@
import json
import math
import os
+import shlex
import textwrap
from typing import Dict, List, Optional, Tuple
@@ -130,7 +131,8 @@ def _add_constants(self) -> None:
CANCELLED_RETURN_CODE = 137
"""))
- def _get_rclone_flush_script(self) -> str:
+ @staticmethod
+ def get_rclone_flush_script() -> str:
"""Generate rclone flush script for cached storage mounts.
This script blocks job completion until all storage mounted with
@@ -612,7 +614,7 @@ def _add_ray_task(self,
options_str = ', '.join(options)
logger.debug('Added Task with options: '
f'{options_str}')
- rclone_flush_script = self._get_rclone_flush_script()
+ rclone_flush_script = self.get_rclone_flush_script()
unset_ray_env_vars = ' && '.join(
[f'unset {var}' for var in UNSET_RAY_ENV_VARS])
self._code += [
@@ -664,14 +666,20 @@ def add_epilogue(self) -> None:
class SlurmCodeGen(TaskCodeGen):
"""Code generator for task execution on Slurm using native srun."""
- def __init__(self, slurm_job_id: str):
- """Initialize SlurmCodeGen
+ def __init__(
+ self,
+ slurm_job_id: str,
+ container_name: Optional[str],
+ ):
+ """Initialize SlurmCodeGen.
Args:
slurm_job_id: The Slurm job ID, i.e. SLURM_JOB_ID
+ container_name: pyxis container name, or None
"""
super().__init__()
self._slurm_job_id = slurm_job_id
+ self._container_name = container_name
def add_prologue(self, job_id: int) -> None:
assert not self._has_prologue, 'add_prologue() called twice?'
@@ -805,10 +813,18 @@ def add_task(
for k, v in env_vars.items())
sky_env_vars_dict_str = '\n'.join(sky_env_vars_dict_str)
- rclone_flush_script = self._get_rclone_flush_script()
+ rclone_flush_script = self.get_rclone_flush_script()
streaming_msg = self._get_job_started_msg()
has_setup_cmd = self._setup_cmd is not None
+ container_flags = ''
+ if self._container_name is not None:
+ # --container-remap-root must be passed on every srun to get
+            # the correct $HOME.
+ container_flags = (
+ ' --container-remap-root'
+ f' --container-name={shlex.quote(self._container_name)}:exec')
+
self._code += [
sky_env_vars_dict_str,
textwrap.dedent(f"""\
@@ -886,19 +902,36 @@ def build_task_runner_cmd(user_script, extra_flags, log_dir, env_vars_dict,
# allocation. See:
# https://support.schedmd.com/show_bug.cgi?id=14298
# https://github.com/huggingface/datatrove/issues/248
+ cmd_parts = []
+ # Only unset SKY_RUNTIME_DIR for container runs. For non-container
+ # runs, we want to inherit the node-local SKY_RUNTIME_DIR set by
+ # SlurmCommandRunner to avoid SQLite WAL issues on shared filesystems.
+ if {True if container_flags else False}:
+ cmd_parts.append('unset SKY_RUNTIME_DIR;')
+ cmd_parts.extend([
+ constants.SKY_SLURM_PYTHON_CMD,
+ '-m sky.skylet.executor.slurm',
+ runner_args,
+ ])
+ bash_cmd = shlex.quote(' '.join(cmd_parts))
srun_cmd = (
"unset $(env | awk -F= '/^SLURM_/ {{print $1}}') && "
f'srun --export=ALL --quiet --unbuffered --kill-on-bad-exit --jobid={self._slurm_job_id} '
- f'--job-name=sky-{self.job_id}{{job_suffix}} --ntasks-per-node=1 {{extra_flags}} '
- f'{{constants.SKY_SLURM_PYTHON_CMD}} -m sky.skylet.executor.slurm {{runner_args}}'
+ f'--job-name=sky-{self.job_id}{{job_suffix}} --ntasks-per-node=1{container_flags} {{extra_flags}} '
+ f'/bin/bash -c {{bash_cmd}}'
)
- return srun_cmd, script_path
+
+ def cleanup():
+ if script_path is not None:
+ os.remove(script_path)
+
+ return srun_cmd, cleanup
def run_thread_func():
# This blocks until Slurm allocates resources (--exclusive)
# --mem=0 to match RayCodeGen's behavior where we don't explicitly request memory.
run_flags = f'--nodes={num_nodes} --cpus-per-task={task_cpu_demand} --mem=0 {{gpu_arg}} --exclusive'
- srun_cmd, task_script_path = build_task_runner_cmd(
+ srun_cmd, cleanup = build_task_runner_cmd(
script, run_flags, {log_dir!r}, sky_env_vars_dict,
task_name={task_name!r},
alloc_signal=alloc_signal_file,
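
Returning a cleanup callable instead of a bare script path (as the refactor above does) keeps the temp file's lifetime owned by the code that created it. A minimal, self-contained sketch of the pattern; names are illustrative:

    import os
    import tempfile
    from typing import Callable, Tuple


    def make_script_cmd(body: str) -> Tuple[str, Callable[[], None]]:
        """Write `body` to a temp script; return the command plus cleanup."""
        with tempfile.NamedTemporaryFile('w', suffix='.sh',
                                         delete=False) as f:
            f.write(body)
            path = f.name

        def cleanup() -> None:
            # Call once after the spawned process has exited.
            os.remove(path)

        return f'/bin/bash {path}', cleanup
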
@@ -913,8 +946,7 @@ def run_thread_func():
print(line, end='', flush=True)
proc.wait()
- if task_script_path is not None:
- os.remove(task_script_path)
+ cleanup()
return {{'return_code': proc.returncode, 'pid': proc.pid}}
run_thread_result = {{'result': None}}
@@ -955,7 +987,7 @@ def run_thread_wrapper():
# --overlap as we have already secured allocation with the srun for the run section,
# and otherwise this srun would get blocked and deadlock.
setup_flags = f'--overlap --nodes={self._setup_num_nodes}'
- setup_srun, setup_script_path = build_task_runner_cmd(
+ setup_srun, setup_cleanup = build_task_runner_cmd(
{self._setup_cmd!r}, setup_flags, {self._setup_log_dir!r}, {self._setup_envs!r},
is_setup=True
)
@@ -969,8 +1001,7 @@ def run_thread_wrapper():
print(line, end='', flush=True)
setup_proc.wait()
- if setup_script_path is not None:
- os.remove(setup_script_path)
+ setup_cleanup()
setup_returncode = setup_proc.returncode
if setup_returncode != 0:
diff --git a/sky/catalog/__init__.py b/sky/catalog/__init__.py
index 4180bf057f4..038bd57e52d 100644
--- a/sky/catalog/__init__.py
+++ b/sky/catalog/__init__.py
@@ -335,6 +335,7 @@ def get_common_gpus() -> List[str]:
'H200',
'L4',
'L40S',
+ 'RTX5090',
'T4',
'V100',
'V100-32GB',
diff --git a/sky/catalog/common.py b/sky/catalog/common.py
index c284e72e3bc..9be2c23a124 100644
--- a/sky/catalog/common.py
+++ b/sky/catalog/common.py
@@ -3,6 +3,7 @@
import difflib
import hashlib
import os
+import tempfile
import time
import typing
from typing import Callable, Dict, List, NamedTuple, Optional, Tuple, Union
@@ -243,9 +244,19 @@ def _update_catalog():
raise e
else:
# Download successful, save the catalog to a local file.
+ # Use atomic write (write to temp file, then rename) to
+ # avoid race conditions when multiple processes read/write
+ # the catalog file concurrently during parallel test
+ # execution.
os.makedirs(os.path.dirname(catalog_path), exist_ok=True)
- with open(catalog_path, 'w', encoding='utf-8') as f:
+ with tempfile.NamedTemporaryFile(
+ mode='w',
+ dir=os.path.dirname(catalog_path),
+ delete=False,
+ encoding='utf-8') as f:
f.write(r.text)
+ tmp_path = f.name
+ os.rename(tmp_path, catalog_path)
with open(meta_path + '.md5', 'w', encoding='utf-8') as f:
f.write(hashlib.md5(r.text.encode()).hexdigest())
logger.debug(f'Updated {cloud} catalog {filename}.')
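
The write-then-rename above relies on `os.rename` being atomic within a single filesystem, so concurrent readers observe either the old or the new catalog, never a torn write. A standalone sketch of the same pattern:

    import os
    import tempfile


    def atomic_write_text(path: str, text: str) -> None:
        """Atomically replace `path` with `text`."""
        os.makedirs(os.path.dirname(path), exist_ok=True)
        # The temp file must be created in the destination directory:
        # rename() is only atomic within one filesystem.
        with tempfile.NamedTemporaryFile('w',
                                         dir=os.path.dirname(path),
                                         delete=False,
                                         encoding='utf-8') as f:
            f.write(text)
            tmp_path = f.name
        os.rename(tmp_path, path)
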
diff --git a/sky/catalog/data_fetchers/fetch_aws.py b/sky/catalog/data_fetchers/fetch_aws.py
index 483639e717e..02da872c38b 100644
--- a/sky/catalog/data_fetchers/fetch_aws.py
+++ b/sky/catalog/data_fetchers/fetch_aws.py
@@ -13,13 +13,14 @@
import textwrap
import traceback
import typing
-from typing import List, Optional, Set, Tuple, Union
+from typing import Any, Dict, List, Optional, Set, Tuple, Union
import numpy as np
from sky import exceptions
from sky.adaptors import aws
from sky.adaptors import common as adaptors_common
+from sky.skylet import constants
from sky.utils import log_utils
from sky.utils import ux_utils
@@ -67,8 +68,21 @@
# The following columns will be included in the final catalog.
USEFUL_COLUMNS = [
- 'InstanceType', 'AcceleratorName', 'AcceleratorCount', 'vCPUs', 'MemoryGiB',
- 'GpuInfo', 'Price', 'SpotPrice', 'Region', 'AvailabilityZone', 'Arch'
+ 'InstanceType',
+ 'AcceleratorName',
+ 'AcceleratorCount',
+ 'vCPUs',
+ 'MemoryGiB',
+ 'GpuInfo',
+ 'Price',
+ 'SpotPrice',
+ 'Region',
+ 'AvailabilityZone',
+ 'Arch',
+ 'LocalDiskType',
+ 'NVMeSupported',
+ 'LocalDiskSize',
+ 'LocalDiskCount',
]
# NOTE: the hard-coded us-east-1 URL is not a typo. AWS pricing endpoint is
@@ -269,23 +283,49 @@ def get_memory_gib(row) -> float:
return row['MemoryInfo']['SizeInMiB'] / 1024
return float(row['Memory'].split(' GiB')[0])
+ def get_local_disk_info(row) -> Dict[str, Any]:
+ info: Dict[str, Any] = {}
+ local_disk_supported = row['InstanceStorageSupported']
+ info['LocalDiskType'] = None
+ info['NVMeSupported'] = False
+ info['LocalDiskSize'] = None
+ info['LocalDiskCount'] = None
+
+ if local_disk_supported:
+ raw_info = row['InstanceStorageInfo']
+ info['NVMeSupported'] = raw_info['NvmeSupport'] == 'required'
+            # This is always 1. AWS likely made this a list to
+            # accommodate future changes.
+ assert len(raw_info['Disks']) == 1, (
+ f'Instance type {row["InstanceType"]} has '
+ f'{len(raw_info["Disks"])} disk entries, expected 1.')
+ disk_info = raw_info['Disks'][0]
+ assert disk_info['Type'] in constants.LOCAL_DISK_TYPES, (
+ f'Instance type {row["InstanceType"]} has unknown '
+ f'disk type {disk_info["Type"]}.')
+ info['LocalDiskType'] = disk_info['Type']
+ info['LocalDiskSize'] = disk_info['SizeInGB']
+ info['LocalDiskCount'] = disk_info['Count']
+ return info
+
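
A hypothetical input row, with field names following the EC2 DescribeInstanceTypes response this fetcher consumes, and the columns `get_local_disk_info` would derive from it:

    # Hypothetical row for an instance type with one 475 GB NVMe SSD.
    row = {
        'InstanceType': 'i3.large',
        'InstanceStorageSupported': True,
        'InstanceStorageInfo': {
            'NvmeSupport': 'required',
            'Disks': [{'Type': 'ssd', 'SizeInGB': 475, 'Count': 1}],
        },
    }
    # get_local_disk_info(row) ->
    # {'LocalDiskType': 'ssd', 'NVMeSupported': True,
    #  'LocalDiskSize': 475, 'LocalDiskCount': 1}
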
def get_additional_columns(row) -> pd.Series:
acc_name, acc_count = get_acc_info(row)
- # AWS p3dn.24xlarge offers a different V100 GPU.
+ # AWS instance type workarounds for incorrect/missing GPU info.
# See https://aws.amazon.com/blogs/compute/optimizing-deep-learning-on-p3-and-p3dn-with-efa/ # pylint: disable=line-too-long
if row['InstanceType'] == 'p3dn.24xlarge':
acc_name = 'V100-32GB'
- if row['InstanceType'] == 'p4de.24xlarge':
+ elif row['InstanceType'] == 'p4de.24xlarge':
acc_name = 'A100-80GB'
acc_count = 8
- if row['InstanceType'] == 'p5en.48xlarge':
+ elif row['InstanceType'] in ('p5e.48xlarge', 'p5en.48xlarge'):
# TODO(andyl): Check if this workaround still needed after
# v0.10.0 released. Currently, the acc_name returned by the
# AWS API is 'NVIDIA', which is incorrect. See #4652.
+ # Both p5e.48xlarge and p5en.48xlarge have 8x H200 GPUs.
acc_name = 'H200'
acc_count = 8
- if (row['InstanceType'].startswith('g6f') or
- row['InstanceType'].startswith('gr6f')):
+ elif (row['InstanceType'].startswith('g6f') or
+ row['InstanceType'].startswith('gr6f')):
# These instance actually have only fractional GPUs, but the API
# returns Count: 1 or Count: 0 under GpuInfo. We need to
# directly check the GPU memory to get the actual fraction of
@@ -297,14 +337,18 @@ def get_additional_columns(row) -> pd.Series:
fraction = row['GpuInfo']['Gpus'][0]['MemoryInfo'][
'SizeInMiB'] / L4_GPU_MEMORY
acc_count = round(fraction, 3)
- if row['InstanceType'] == 'p5.4xlarge':
+ elif row['InstanceType'] == 'p5.4xlarge':
acc_count = 1
+ elif row['InstanceType'].startswith('g7e'):
+            # Rename "RTX PRO Server 6000" to "RTXPRO6000" for consistency.
+ acc_name = 'RTXPRO6000'
return pd.Series({
'AcceleratorName': acc_name,
'AcceleratorCount': acc_count,
'vCPUs': get_vcpus(row),
'MemoryGiB': get_memory_gib(row),
'Arch': get_arch(row),
+ **get_local_disk_info(row)
})
# The AWS API may not have all the instance types in the pricing table,
diff --git a/sky/catalog/kubernetes_catalog.py b/sky/catalog/kubernetes_catalog.py
index 1ed95b70a1a..4fa29face85 100644
--- a/sky/catalog/kubernetes_catalog.py
+++ b/sky/catalog/kubernetes_catalog.py
@@ -206,6 +206,13 @@ def _list_accelerators(
for node in nodes:
# Check if node is ready
node_is_ready = node.is_ready()
+ node_is_cordoned = node.is_cordoned()
+ node_taints = node.get_taints(
+ exclude_cordon=True,
+ exclude_not_ready=True,
+ exclude_effects=['PreferNoSchedule'],
+ exclude_keys=kubernetes_utils.get_handled_taint_keys())
+ node_is_tainted = len(node_taints) > 0
for key in keys:
if key in node.metadata.labels:
@@ -268,8 +275,9 @@ def _list_accelerators(
total_accelerators_available[accelerator_name] = (
total_accelerators_available.get(accelerator_name, 0))
- # Skip availability counting for not-ready nodes
- if not node_is_ready:
+ # Skip availability counting for not-ready, cordoned,
+ # or tainted nodes
+ if not node_is_ready or node_is_cordoned or node_is_tainted:
continue
if error_on_get_allocated_gpu_qty_by_node:
diff --git a/sky/catalog/yotta_catalog.py b/sky/catalog/yotta_catalog.py
new file mode 100644
index 00000000000..db317b221ec
--- /dev/null
+++ b/sky/catalog/yotta_catalog.py
@@ -0,0 +1,98 @@
+""" Yotta | Catalog
+This module loads the service catalog file and can be used to
+query instance types and pricing information for Yotta.
+"""
+
+import typing
+from typing import Dict, List, Optional, Tuple, Union
+
+from sky.catalog import common
+from sky.utils import ux_utils
+
+if typing.TYPE_CHECKING:
+ from sky.clouds import cloud
+
+_df = common.read_catalog('yotta/vms.csv')
+
+
+def instance_type_exists(instance_type: str) -> bool:
+ return common.instance_type_exists_impl(_df, instance_type)
+
+
+def validate_region_zone(
+ region: Optional[str],
+ zone: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
+ if zone is not None:
+ with ux_utils.print_exception_no_traceback():
+ raise ValueError('Yotta does not support zones.')
+ return common.validate_region_zone_impl('yotta', _df, region, zone)
+
+
+def get_hourly_cost(instance_type: str,
+ use_spot: bool = False,
+ region: Optional[str] = None,
+ zone: Optional[str] = None) -> float:
+ """Returns the cost, or the cheapest cost among all zones for spot."""
+ return common.get_hourly_cost_impl(_df, instance_type, use_spot, region,
+ zone)
+
+
+def get_vcpus_mem_from_instance_type(
+ instance_type: str) -> Tuple[Optional[float], Optional[float]]:
+ return common.get_vcpus_mem_from_instance_type_impl(_df, instance_type)
+
+
+def get_default_instance_type(cpus: Optional[str] = None,
+ memory: Optional[str] = None,
+ disk_tier: Optional[str] = None,
+ region: Optional[str] = None,
+ zone: Optional[str] = None) -> Optional[str]:
+ del disk_tier, region, zone # Unused.
+ # NOTE: After expanding catalog to multiple entries, you may
+ # want to specify a default instance type or family.
+ return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory)
+
+
+def get_accelerators_from_instance_type(
+ instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
+ return common.get_accelerators_from_instance_type_impl(_df, instance_type)
+
+
+def get_instance_type_for_accelerator(
+ acc_name: str,
+ acc_count: int,
+ cpus: Optional[str] = None,
+ memory: Optional[str] = None,
+ use_spot: bool = False,
+ region: Optional[str] = None,
+ zone: Optional[str] = None) -> Tuple[Optional[List[str]], List[str]]:
+ """Returns a list of instance types that have the given accelerator."""
+ return common.get_instance_type_for_accelerator_impl(df=_df,
+ acc_name=acc_name,
+ acc_count=acc_count,
+ cpus=cpus,
+ memory=memory,
+ use_spot=use_spot,
+ region=region,
+ zone=zone)
+
+
+def get_region_zones_for_instance_type(instance_type: str,
+ use_spot: bool) -> List['cloud.Region']:
+ df = _df[_df['InstanceType'] == instance_type]
+ return common.get_region_zones(df, use_spot)
+
+
+def list_accelerators(
+ gpus_only: bool,
+ name_filter: Optional[str],
+ region_filter: Optional[str],
+ quantity_filter: Optional[int],
+ case_sensitive: bool = True,
+ all_regions: bool = False,
+ require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
+ """Returns all instance types in Yotta offering GPUs."""
+ del require_price # Unused.
+ return common.list_accelerators_impl('Yotta', _df, gpus_only, name_filter,
+ region_filter, quantity_filter,
+ case_sensitive, all_regions)
diff --git a/sky/client/cli/command.py b/sky/client/cli/command.py
index 0a81f2b50ed..312238f9b39 100644
--- a/sky/client/cli/command.py
+++ b/sky/client/cli/command.py
@@ -28,10 +28,12 @@
import fnmatch
import os
import pathlib
+import re
import shlex
import shutil
import subprocess
import sys
+import tempfile
import time
import traceback
import typing
@@ -68,6 +70,7 @@
from sky.schemas.api import responses
from sky.server import common as server_common
from sky.server import constants as server_constants
+from sky.server.requests import payloads
from sky.server.requests import requests
from sky.skylet import autostop_lib
from sky.skylet import constants
@@ -123,7 +126,8 @@
]
_DEFAULT_MANAGED_JOB_FIELDS_TO_GET = [
'job_id', 'task_id', 'workspace', 'job_name', 'task_name', 'resources',
- 'submitted_at', 'end_at', 'job_duration', 'recovery_count', 'status', 'pool'
+ 'submitted_at', 'end_at', 'job_duration', 'recovery_count', 'status',
+ 'pool', 'is_primary_in_job_group'
]
_VERBOSE_MANAGED_JOB_FIELDS_TO_GET = _DEFAULT_MANAGED_JOB_FIELDS_TO_GET + [
'current_cluster_name', 'job_id_on_pool_cluster', 'start_at', 'infra',
@@ -317,14 +321,21 @@ def _async_call_or_wait(request_id: server_common.RequestId[T],
f'{colorama.Style.RESET_ALL}\n')
-def _merge_env_vars(env_dict: Optional[Dict[str, str]],
- env_list: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
- """Merges all values from env_list into env_dict."""
- if not env_dict:
- return env_list
- for (key, value) in env_list:
- env_dict[key] = value
- return list(env_dict.items())
+def _merge_cli_and_file_vars(
+ env_dicts: List[Optional[Dict[str, str]]],
+ env_list: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
+    """Merges all values from env_list and env_dicts.
+
+    Priority: env_list has the highest priority, and an env_dict with a
+    higher index takes priority over one with a lower index."""
+ final_env_dict = {}
+ for env_dict in env_dicts:
+ if env_dict is None:
+ continue
+ for k, v in env_dict.items():
+ final_env_dict[k] = v
+ for k, v in env_list:
+ final_env_dict[k] = v
+ return list(final_env_dict.items())
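
For example, with the precedence described in the docstring (a later dict overrides an earlier one, and the CLI list overrides both):

    merged = _merge_cli_and_file_vars(
        env_dicts=[{'A': '1', 'B': '1'}, {'B': '2'}],
        env_list=[('A', '3')])
    assert merged == [('A', '3'), ('B', '2')]
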
def _complete_cluster_name(ctx: click.Context, param: click.Parameter,
@@ -709,6 +720,27 @@ def _check_yaml(entrypoint: str) -> Tuple[bool, Optional[Dict[str, Any]]]:
return is_yaml, result
+def _check_recipe_reference(entrypoint: str) -> Tuple[bool, Optional[str]]:
+ """Check if entrypoint is a recipe reference like 'recipes:my-recipe'.
+
+ Args:
+ entrypoint: The entrypoint string to check.
+
+ Returns:
+ Tuple of (is_recipe, recipe_name). If is_recipe is True, recipe_name
+ contains the name of the recipe to fetch from the Recipe Hub.
+ """
+ # Pattern matches 'recipes:'
+    # Pattern matches 'recipes:<recipe-name>'.
+ # and dashes, and must end with an alphanumeric character.
+ pattern = re.compile(r'^recipes:(' + constants.RECIPE_NAME_VALID_REGEX +
+ r')$')
+ match = pattern.match(entrypoint)
+ if match:
+ return True, match.group(1)
+ return False, None
+
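
Expected outcomes of the check above, assuming `constants.RECIPE_NAME_VALID_REGEX` accepts names like `my-recipe` (the exact character class lives in that constant):

    assert _check_recipe_reference('recipes:my-recipe') == (True, 'my-recipe')
    assert _check_recipe_reference('task.yaml') == (False, None)
    assert _check_recipe_reference('recipes:') == (False, None)
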
+
def _pop_and_ignore_fields_in_override_params(
params: Dict[str, Any], field_to_ignore: List[str]) -> None:
"""Pops and ignores fields in override params.
@@ -728,6 +760,55 @@ def _pop_and_ignore_fields_in_override_params(
fg='yellow')
+def _get_recipe_yaml(entrypoint: str) -> Optional[str]:
+ """Checks if entrypoint is a recipe reference and returns the recipe YAML.
+
+ Fetches the recipe content from the API server.
+
+ Args:
+ entrypoint: The entrypoint string to check.
+
+ Returns:
+        Path to a temporary YAML file containing the recipe content if
+        entrypoint is a recipe reference. Otherwise, None.
+ """
+ is_recipe, recipe_name = _check_recipe_reference(entrypoint)
+ if is_recipe:
+ assert recipe_name is not None # For mypy
+ click.secho('Recipe to run: ', fg='cyan', nl=False)
+ click.secho(recipe_name)
+ try:
+ # Make API request to fetch recipe from server
+ body = payloads.RecipeGetBody(recipe_name=recipe_name)
+ response = server_common.make_authenticated_request(
+ 'POST', '/recipes/get', json=body.model_dump())
+ request_id: server_common.RequestId[Optional[Dict[
+ str, Any]]] = server_common.get_request_id(response)
+ recipe = sdk.get(request_id)
+ except requests_lib.exceptions.ConnectionError as e:
+ raise click.UsageError(
+ f'Failed to connect to API server to fetch recipe '
+ f'{recipe_name!r}: {e}') from e
+ except Exception as e:
+ # Handle errors from the API server (e.g., recipe not found)
+ raise click.UsageError(str(e)) from e
+
+ if recipe is None:
+ raise click.UsageError(f'Recipe not found: {recipe_name}')
+
+ content = recipe.get('content')
+ if content is None:
+ raise click.UsageError(f'Recipe {recipe_name!r} has no content')
+
+ # Write to temp file and treat as YAML
+ with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml',
+ delete=False) as f:
+ f.write(content)
+ return f.name
+ else:
+ logger.debug(f'Not a recipe reference: {entrypoint}')
+ return None
+
+
def _make_task_or_dag_from_entrypoint_with_overrides(
entrypoint: Tuple[str, ...],
*,
@@ -766,7 +847,14 @@ def _make_task_or_dag_from_entrypoint_with_overrides(
raise click.UsageError('Cannot specify both --git-url and --workdir')
entrypoint = ' '.join(entrypoint)
+
+ # Check if entrypoint is a recipe reference (recipes:)
+ recipe_yaml = _get_recipe_yaml(entrypoint)
+ if recipe_yaml is not None:
+ entrypoint = recipe_yaml
+
is_yaml, _ = _check_yaml(entrypoint)
+
entrypoint: Optional[str]
if is_yaml:
# Treat entrypoint as a yaml.
@@ -801,6 +889,20 @@ def _make_task_or_dag_from_entrypoint_with_overrides(
if is_yaml:
assert entrypoint is not None
usage_lib.messages.usage.update_user_task_yaml(entrypoint)
+
+ # Check if this is a JobGroup YAML
+ if dag_utils.is_job_group_yaml(entrypoint):
+ click.secho('Detected JobGroup YAML', fg='cyan')
+ dag = dag_utils.load_job_group_from_yaml(entrypoint,
+ env_overrides=env,
+ secrets_overrides=secret)
+ if override_params:
+ click.secho(
+ f'WARNING: override params {override_params} are ignored '
+ 'for JobGroup YAML.',
+ fg='yellow')
+ return dag
+
dag = dag_utils.load_chain_dag_from_yaml(entrypoint,
env_overrides=env,
secret_overrides=secret)
@@ -1058,6 +1160,7 @@ def launch(
image_id: Optional[str],
env_file: Optional[Dict[str, str]],
env: List[Tuple[str, str]],
+ secret_file: Optional[Dict[str, str]],
secret: List[Tuple[str, str]],
disk_size: Optional[int],
disk_tier: Optional[str],
@@ -1093,7 +1196,8 @@ def launch(
# job can take up resources on the API server. When there are a lot of
# `launch` submitted asynchronously, the log tailing may overwhelm the API
# server, if the jobs are long running.
- env = _merge_env_vars(env_file, env)
+ env = _merge_cli_and_file_vars([env_file], env)
+ secret = _merge_cli_and_file_vars([env_file, secret_file], secret)
controller_utils.check_cluster_name_not_controller(
cluster, operation_str='Launching tasks on it')
if backend_name is None:
@@ -1247,6 +1351,7 @@ def exec(
image_id: Optional[str],
env_file: Optional[Dict[str, str]],
env: List[Tuple[str, str]],
+ secret_file: Optional[Dict[str, str]],
secret: List[Tuple[str, str]],
cpus: Optional[str],
memory: Optional[str],
@@ -1327,7 +1432,8 @@ def exec(
raise click.UsageError('Missing argument \'[ENTRYPOINT]...\'')
assert cluster is not None, (cluster, cluster_option, entrypoint)
- env = _merge_env_vars(env_file, env)
+ env = _merge_cli_and_file_vars([env_file], env)
+ secret = _merge_cli_and_file_vars([env_file, secret_file], secret)
controller_utils.check_cluster_name_not_controller(
cluster, operation_str='Executing task on it')
@@ -1568,12 +1674,15 @@ def _handle_services_request(
# print the original error.
pass
if not msg:
- msg = (f'Failed to fetch {noun} statuses due to connection issues. '
- 'Please try again later. Details: '
- f'{common_utils.format_exception(e, use_bracket=True)}')
- except Exception as e: # pylint: disable=broad-except
- msg = (f'Failed to fetch {noun} statuses: '
- f'{common_utils.format_exception(e, use_bracket=True)}')
+ # This is an actual error (connection issues), not a normal state.
+ # Format the error message and raise a new exception.
+ # Use 'from None' to suppress the exception chain and only show
+ # the formatted message.
+ error_msg = (
+ f'Failed to fetch {noun} statuses due to connection issues. '
+ 'Please try again later. Details: '
+ f'{common_utils.format_exception(e, use_bracket=True)}')
+ raise RuntimeError(error_msg) from None
else:
if show_endpoint:
if len(service_records) != 1:
@@ -1630,7 +1739,8 @@ def _show_endpoint(query_clusters: Optional[List[str]],
('endpoint port' if show_single_endpoint else 'endpoints')))
cluster_record = cluster_records[0]
- if cluster_record['status'] != status_lib.ClusterStatus.UP:
+ if cluster_record['status'] not in (status_lib.ClusterStatus.UP,
+ status_lib.ClusterStatus.AUTOSTOPPING):
with ux_utils.print_exception_no_traceback():
raise RuntimeError(f'Cluster {cluster_record["name"]!r} '
'is not in UP status.')
@@ -1807,6 +1917,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
# Do not show job queue if user specifies clusters, and if user
# specifies --ip or --endpoint(s).
show_managed_jobs = show_managed_jobs and not any([clusters, ip, endpoints])
+ show_pools = show_pools and not any([clusters, ip, endpoints])
show_endpoints = endpoints or endpoint is not None
show_single_endpoint = endpoint is not None
show_services = show_services and not any([clusters, ip, endpoints])
@@ -2022,6 +2133,11 @@ def submit_enabled_clouds():
sdk.api_cancel(pool_status_request_id, silent=True)
num_pools = -1
msg = 'KeyboardInterrupt'
+ except Exception as e: # pylint: disable=broad-except
+ # For internal calls, handle exceptions gracefully by
+ # printing the error message instead of crashing.
+ num_pools = None
+ msg = str(e)
if num_pools is not None:
if num_pools > 0:
click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
@@ -2050,6 +2166,11 @@ def submit_enabled_clouds():
sdk.api_cancel(service_status_request_id, silent=True)
num_services = -1
msg = 'KeyboardInterrupt'
+ except Exception as e: # pylint: disable=broad-except
+ # For internal calls, handle exceptions gracefully by
+ # printing the error message instead of crashing.
+ num_services = None
+ msg = str(e)
click.echo(msg)
if num_services is not None:
hints.append(
@@ -2247,6 +2368,10 @@ def _get_job_queue(cluster):
is_flag=True,
default=False,
help='Stream the cluster provisioning logs (provision.log).')
+@click.option('--autostop',
+ is_flag=True,
+ default=False,
+ help='Stream the autostop hook logs from the cluster.')
@click.option('--worker',
'-w',
default=None,
@@ -2290,6 +2415,7 @@ def logs(
cluster: str,
job_ids: Tuple[str, ...],
provision: bool,
+ autostop: bool, # pylint: disable=redefined-outer-name
worker: Optional[int],
sync_down: bool,
status: bool, # pylint: disable=redefined-outer-name
@@ -2319,6 +2445,9 @@ def logs(
4. If the job fails or fetching the logs fails, the command will exit with
a non-zero return code.
+
+ 5. If ``--autostop`` is specified, stream the autostop hook logs from the
+ cluster. This shows the output of the autostop hook script.
"""
if worker is not None:
if not provision:
@@ -2327,11 +2456,20 @@ def logs(
if worker < 1:
raise click.UsageError('--worker must be a positive integer.')
+ if provision and autostop:
+ raise click.UsageError(
+ '--provision and --autostop cannot be used together.')
+
if provision and (sync_down or status or job_ids):
raise click.UsageError(
'--provision cannot be combined with job log options '
'(--sync-down/--status/job IDs).')
+ if autostop and (sync_down or status or job_ids or worker is not None):
+ raise click.UsageError(
+ '--autostop cannot be combined with job log options '
+ '(--sync-down/--status/--worker/job IDs).')
+
if sync_down and status:
raise click.UsageError(
'Both --sync_down and --status are specified '
@@ -2352,6 +2490,13 @@ def logs(
follow=follow,
tail=tail))
+ if autostop:
+ # Stream autostop hook logs
+ sys.exit(
+ sdk.tail_autostop_logs(cluster_name=cluster,
+ follow=follow,
+ tail=tail))
+
if sync_down:
with rich_utils.client_status(
ux_utils.spinner_message('Downloading logs')):
@@ -2550,13 +2695,15 @@ def cancel(
@flags.all_option('Stop all existing clusters.')
@flags.all_users_option('Stop all existing clusters for all users.')
@flags.yes_option()
-@_add_click_options(flags.COMMON_OPTIONS)
+@_add_click_options(flags.GRACEFUL_OPTIONS + flags.COMMON_OPTIONS)
@usage_lib.entrypoint
def stop(
clusters: List[str],
all: bool, # pylint: disable=redefined-builtin
all_users: bool,
yes: bool,
+ graceful: bool,
+ graceful_timeout: Optional[int],
async_call: bool,
):
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
@@ -2593,6 +2740,8 @@ def stop(
all_users=all_users,
down=False,
no_confirm=yes,
+ graceful=graceful,
+ graceful_timeout=graceful_timeout,
async_call=async_call)
@@ -2970,7 +3119,7 @@ def start(
' in certain manual troubleshooting scenarios; with it set, it is the'
' user\'s responsibility to ensure there are no leaked instances and '
'related resources.'))
-@_add_click_options(flags.COMMON_OPTIONS)
+@_add_click_options(flags.GRACEFUL_OPTIONS + flags.COMMON_OPTIONS)
@usage_lib.entrypoint
def down(
clusters: List[str],
@@ -2978,6 +3127,8 @@ def down(
all_users: bool,
yes: bool,
purge: bool,
+ graceful: bool,
+ graceful_timeout: Optional[int],
async_call: bool,
):
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
@@ -3014,6 +3165,8 @@ def down(
down=True,
no_confirm=yes,
purge=purge,
+ graceful=graceful,
+ graceful_timeout=graceful_timeout,
async_call=async_call)
@@ -3175,6 +3328,8 @@ def _down_or_stop_clusters(
down: bool = False, # pylint: disable=redefined-outer-name
no_confirm: bool = True,
purge: bool = False,
+ graceful: bool = False,
+ graceful_timeout: Optional[int] = None,
idle_minutes_to_autostop: Optional[int] = None,
wait_for: Optional[autostop_lib.AutostopWaitFor] = None,
async_call: bool = False) -> None:
@@ -3192,6 +3347,10 @@ def _down_or_stop_clusters(
down: If True, tear down the clusters.
no_confirm: If True, skip the confirmation prompt.
purge: If True, forcefully remove the clusters from the cluster table.
+        graceful: If True, cancel the user's task, but block until
+            MOUNT_CACHED uploads finish.
+ graceful_timeout: If not None, sets a timeout for the graceful option
+ above (in seconds).
idle_minutes_to_autostop: The number of minutes to wait before
automatically stopping the cluster.
wait_for: Determines the condition for resetting the idleness timer.
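As a concrete reference, a minimal sketch of driving the new graceful path through the Python SDK (cluster name is a placeholder; mirrors `sky stop --graceful --graceful-timeout 600`):

    # Stop a cluster but let MOUNT_CACHED uploads drain first, giving up
    # after 10 minutes.
    from sky.client import sdk

    request_id = sdk.stop('my-cluster', graceful=True, graceful_timeout=600)
    sdk.stream_and_get(request_id)  # block until the stop request completes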
@@ -3375,9 +3534,15 @@ def _down_or_stop(name: str):
else:
try:
if down:
- request_id = sdk.down(name, purge=purge)
+ request_id = sdk.down(name,
+ purge=purge,
+ graceful=graceful,
+ graceful_timeout=graceful_timeout)
else:
- request_id = sdk.stop(name, purge=purge)
+ request_id = sdk.stop(name,
+ purge=purge,
+ graceful=graceful,
+ graceful_timeout=graceful_timeout)
request_ids.append(request_id)
progress.stop()
_async_call_or_wait(
@@ -3734,7 +3899,10 @@ def _count_not_ready_gpus(
continue
node_is_ready = getattr(node_info, 'is_ready', True)
- if not node_is_ready:
+ node_is_cordoned = getattr(node_info, 'is_cordoned', False)
+ node_taints = getattr(node_info, 'taints', None) or []
+ node_is_tainted = len(node_taints) > 0
+ if not node_is_ready or node_is_cordoned or node_is_tainted:
not_ready_counts[accelerator_type] += accelerator_count
return not_ready_counts
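The getattr defaults above keep the check backward compatible with older servers whose node_info objects lack the new fields; the same predicate as a standalone sketch:

    # Missing is_ready defaults to healthy; missing is_cordoned/taints
    # default to not cordoned/untainted, matching the old behavior.
    def node_is_schedulable(node_info) -> bool:
        is_ready = getattr(node_info, 'is_ready', True)
        is_cordoned = getattr(node_info, 'is_cordoned', False)
        taints = getattr(node_info, 'taints', None) or []
        return is_ready and not is_cordoned and not taints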
@@ -3890,7 +4058,7 @@ def _format_kubernetes_node_info_combined(
context_title_str: str = 'CONTEXT') -> str:
node_table = log_utils.create_table([
context_title_str, 'NODE', 'vCPU', 'Memory (GB)', 'GPU',
- 'GPU UTILIZATION'
+ 'GPU UTILIZATION', 'NODE STATUS'
])
no_permissions_str = ''
@@ -3949,15 +4117,41 @@ def _format_kubernetes_node_info_combined(
utilization_str = (
f'{available} of '
f'{node_info.total["accelerator_count"]} free')
+
+ # Build node status string
+ status_info = []
# Check if node is ready (defaults to True for backward
# compatibility with older server versions)
node_is_ready = getattr(node_info, 'is_ready', True)
if not node_is_ready:
- utilization_str += ' (Node NotReady)'
-
+ status_info.append('NotReady')
+ node_is_cordoned = getattr(node_info, 'is_cordoned', False)
+ if node_is_cordoned:
+ status_info.append('Cordoned')
+ # Add taint info grouped by effect
+ taints = getattr(node_info, 'taints', None)
+ if taints:
+ # Group taints by effect: 'NoSchedule Taint [key1, key2],
+ # NoExecute Taint [key3]'
+ taints_by_effect: Dict[str, List[str]] = {}
+ for taint in taints:
+ effect = taint['effect']
+ key = taint['key']
+ if effect not in taints_by_effect:
+ taints_by_effect[effect] = []
+ taints_by_effect[effect].append(key)
+ taints_strs = []
+ for effect, keys in taints_by_effect.items():
+ taints_strs.append(
+ f'{effect} Taint [{", ".join(keys)}]')
+ if taints_strs:
+ status_info.append(', '.join(taints_strs))
+
+ status_str = ', '.join(
+ status_info) if status_info else 'Healthy'
node_table.add_row([
context_name, node_name, cpu_str, memory_str, acc_type,
- utilization_str
+ utilization_str, status_str
])
k8s_per_node_acc_message = (f'{cloud_str} per-node GPU availability')
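A self-contained sketch of the taint-grouping logic above, with sample data:

    # Group taint keys by effect, producing strings like
    # 'NoSchedule Taint [gpu, maintenance], NoExecute Taint [evict]'.
    from collections import defaultdict

    taints = [{'effect': 'NoSchedule', 'key': 'gpu'},
              {'effect': 'NoSchedule', 'key': 'maintenance'},
              {'effect': 'NoExecute', 'key': 'evict'}]
    by_effect = defaultdict(list)
    for taint in taints:
        by_effect[taint['effect']].append(taint['key'])
    print(', '.join(f'{effect} Taint [{", ".join(keys)}]'
                    for effect, keys in by_effect.items()))
    # -> NoSchedule Taint [gpu, maintenance], NoExecute Taint [evict]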
@@ -4005,6 +4199,30 @@ def _format_slurm_node_info(slurm_cluster_names: List[str]) -> str:
f'{colorama.Style.RESET_ALL}\n'
f'{node_table.get_string()}')
+ def _get_labeled_zero_gpu_hint(
+ all_nodes_info: List[Tuple[str,
+ 'models.KubernetesNodesInfo']]) -> str:
+ """Returns a hint if any nodes have GPU labels but 0 GPU resources."""
+ # Collect nodes with GPU labels but 0 GPU resources
+ labeled_zero_gpu_nodes = [
+ (context, node_name)
+ for context, nodes_info in all_nodes_info
+ for node_name, node_info in nodes_info.node_info_dict.items()
+ if (node_info.accelerator_type is not None and
+ node_info.total.get('accelerator_count', 0) == 0)
+ ]
+
+ if not labeled_zero_gpu_nodes:
+ return ''
+
+ num_affected_nodes = len(labeled_zero_gpu_nodes)
+ node_list = ', '.join(
+ f'{ctx}/{name}' for ctx, name in labeled_zero_gpu_nodes[:3])
+        ellipsis_str = '...' if len(labeled_zero_gpu_nodes) > 3 else ''
+        return (f'Note: Some Kubernetes nodes have GPU labels but report 0 '
+                f'GPU resources. Please check the node labels and '
+                f'configuration. {num_affected_nodes} affected node(s): '
+                f'{node_list}{ellipsis_str}')
+
def _format_kubernetes_realtime_gpu(
total_table: Optional['prettytable.PrettyTable'],
k8s_realtime_infos: List[Tuple[str, 'prettytable.PrettyTable']],
@@ -4070,6 +4288,11 @@ def _possibly_show_k8s_like_realtime(
show_node_info=True,
is_ssh=is_ssh)
+ # Check for nodes with GPU labels but 0 GPU resources
+ labeled_zero_hint = _get_labeled_zero_gpu_hint(all_nodes_info)
+ if labeled_zero_hint:
+ k8s_messages += labeled_zero_hint
+
if kubernetes_autoscaling:
k8s_messages += ('\n' +
kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE)
@@ -4078,6 +4301,8 @@ def _possibly_show_k8s_like_realtime(
if not ssh_is_enabled:
yield ('SSH Node Pools are not enabled. To fix, run: '
'sky check ssh ')
+ if k8s_messages and print_section_titles:
+ yield '\n\n'
yield k8s_messages
return True, print_section_titles, ''
else:
@@ -4085,6 +4310,8 @@ def _possibly_show_k8s_like_realtime(
if not kubernetes_is_enabled:
yield ('Kubernetes is not enabled. To fix, run: '
'sky check kubernetes ')
+ if k8s_messages and print_section_titles:
+ yield '\n\n'
yield k8s_messages
return True, print_section_titles, ''
return False, print_section_titles, k8s_messages
@@ -4112,6 +4339,11 @@ def _possibly_show_k8s_like_realtime_for_acc(
all_nodes_info,
show_node_info=False,
is_ssh=is_ssh)
+
+ # Check for nodes with GPU labels but 0 GPU resources
+ labeled_zero_hint = _get_labeled_zero_gpu_hint(all_nodes_info)
+ if labeled_zero_hint:
+ k8s_messages += labeled_zero_hint
except ValueError as e:
# In the case of a specific accelerator, show the error message
# immediately (e.g., "Resources H100 not found ...")
@@ -4195,6 +4427,8 @@ def _output() -> Generator[str, None, None]:
stop_iter = stop_iter or stop_iter_one
print_section_titles = (print_section_titles or
print_section_titles_one)
+ if k8s_messages and k8s_messages_one:
+ k8s_messages += '\n'
k8s_messages += k8s_messages_one
prev_print_section_titles = print_section_titles_one
if stop_iter:
@@ -4381,11 +4615,8 @@ def _output() -> Generator[str, None, None]:
min_spot_price=('spot_price',
'min'))
df = df.merge(min_price_df, on='cloud')
- # Sort within each cloud by price.
- df = df.groupby('cloud', group_keys=False).apply(
- lambda x: x.sort_values(by=['price', 'spot_price']))
- # Sort across groups (clouds).
- df = df.sort_values(by=['min_price', 'min_spot_price'])
+ df = df.sort_values(
+ by=['min_price', 'min_spot_price', 'price', 'spot_price'])
df = df.drop(columns=['min_price', 'min_spot_price'])
sorted_dataclasses = [
catalog_common.InstanceTypeInfo(*row)
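The replacement above collapses a groupby-then-sort into one multi-key sort: min_price/min_spot_price are constant within a cloud, so (assuming distinct per-cloud minima) sorting on them first keeps each cloud's rows contiguous and ordered across clouds, while price/spot_price order rows within a cloud. A toy check:

    import pandas as pd

    df = pd.DataFrame({
        'cloud': ['aws', 'aws', 'gcp', 'gcp'],
        'price': [3.0, 1.0, 2.0, 4.0],
        'spot_price': [1.5, 0.5, 1.0, 2.0],
    })
    min_price_df = df.groupby('cloud').agg(
        min_price=('price', 'min'), min_spot_price=('spot_price', 'min'))
    df = df.merge(min_price_df, on='cloud')
    df = df.sort_values(
        by=['min_price', 'min_spot_price', 'price', 'spot_price'])
    print(df['price'].tolist())  # [1.0, 3.0, 2.0, 4.0]: aws rows before gcp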
@@ -4644,6 +4875,12 @@ def volumes_apply(
volume_config_dict: Dict[str, Any] = {}
if entrypoint is not None and len(entrypoint) > 0:
entrypoint_str = ' '.join(entrypoint)
+
+ # Check if the entrypoint is a recipe reference
+ recipe_yaml = _get_recipe_yaml(entrypoint_str)
+ if recipe_yaml is not None:
+ entrypoint_str = recipe_yaml
+
is_yaml, yaml_config, yaml_file_provided, invalid_reason = (
_check_yaml_only(entrypoint_str))
if not is_yaml:
@@ -4717,10 +4954,18 @@ def _build_volume_override_config(
is_flag=True,
required=False,
help='Show all information in full.')
+@click.option('--refresh',
+ '-r',
+ default=False,
+ is_flag=True,
+ required=False,
+              help='Refresh volume state from cloud APIs before listing. '
+              'Without this flag, cached data is returned, which is '
+              'refreshed periodically by the background daemon.')
@usage_lib.entrypoint
-def volumes_ls(verbose: bool):
+def volumes_ls(verbose: bool, refresh: bool):
"""List volumes managed by SkyPilot."""
- request_id = volumes_sdk.ls()
+ request_id = volumes_sdk.ls(refresh=refresh)
all_volumes = sdk.stream_and_get(request_id)
volume_table = table_utils.format_volume_table(all_volumes,
show_all=verbose)
@@ -4881,6 +5126,7 @@ def jobs_launch(
job_recovery: Optional[str],
env_file: Optional[Dict[str, str]],
env: List[Tuple[str, str]],
+ secret_file: Optional[Dict[str, str]],
secret: List[Tuple[str, str]],
disk_size: Optional[int],
disk_tier: Optional[str],
@@ -4917,7 +5163,8 @@ def jobs_launch(
raise click.UsageError('Cannot specify both --name and --cluster. '
'Use one of the flags as they are alias.')
name = cluster
- env = _merge_env_vars(env_file, env)
+ env = _merge_cli_and_file_vars([env_file], env)
+ secret = _merge_cli_and_file_vars([env_file, secret_file], secret)
cloud, region, zone = _handle_infra_cloud_region_zone_options(
infra, cloud, region, zone)
task_or_dag = _make_task_or_dag_from_entrypoint_with_overrides(
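The _merge_cli_and_file_vars helper itself is not shown in this diff; a plausible sketch of the documented precedence (later file dicts override earlier ones, so secret_file beats env_file, and explicit CLI pairs beat all files):

    # Hypothetical approximation of the private helper; the real
    # implementation lives elsewhere in the codebase.
    from typing import Dict, List, Optional, Tuple

    def merge_cli_and_file_vars(
            file_dicts: List[Optional[Dict[str, str]]],
            cli_vars: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
        merged: Dict[str, str] = {}
        for file_dict in file_dicts:
            if file_dict:
                merged.update(file_dict)  # later files win
        merged.update(dict(cli_vars))  # CLI flags win over any file
        return list(merged.items())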
@@ -5291,10 +5538,31 @@ def jobs_cancel(
required=False,
help='Download logs for all jobs shown in the queue.')
@click.argument('job_id', required=False, type=int)
+@click.argument('task', required=False, type=str, default=None)
@usage_lib.entrypoint
def jobs_logs(name: Optional[str], job_id: Optional[int], follow: bool,
- controller: bool, refresh: bool, sync_down: bool):
- """Tail or sync down the log of a managed job."""
+ controller: bool, refresh: bool, sync_down: bool,
+ task: Optional[str]):
+ """Tail or sync down the log of a managed job.
+
+ TASK can be a task ID (integer) or task name. Numeric values are treated
+ as task IDs. If not specified, logs for all tasks are shown.
+
+
+
+ \b
+ # View logs for job ID 1, task 0
+ sky jobs logs 1 0
+
+ \b
+ # View logs for job named 'my-job', task 'train'
+ sky jobs logs -n my-job train
+
+ \b
+ # View logs for job named 'my-job', task 'eval'
+ sky jobs logs -n my-job eval
+ """
try:
if sync_down:
with rich_utils.client_status(
@@ -5311,11 +5579,17 @@ def jobs_logs(name: Optional[str], job_id: Optional[int], follow: bool,
logger.info(f'{fore.CYAN}Job {job} logs{controller_str}: '
f'{log_local_path}{style.RESET_ALL}')
else:
+ # Parse task argument: if numeric, treat as task ID (int),
+ # otherwise treat as task name (str)
+ parsed_task: Optional[Union[str, int]] = None
+ if task is not None:
+ parsed_task = int(task) if task.isdigit() else task
returncode = managed_jobs.tail_logs(name=name,
job_id=job_id,
follow=follow,
controller=controller,
- refresh=refresh)
+ refresh=refresh,
+ task=parsed_task)
sys.exit(returncode)
except exceptions.ClusterNotUpError:
with ux_utils.print_exception_no_traceback():
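On the digit check above: str.isdigit() is False for strings like '-1' or '1.5', so only plain non-negative integers become task IDs; anything else passes through as a task name:

    def parse_task(task: str):
        # '0' -> 0 (task ID); 'train' -> 'train' (task name);
        # '-1' -> '-1' (name: isdigit() rejects the minus sign).
        return int(task) if task.isdigit() else task

    assert parse_task('0') == 0
    assert parse_task('train') == 'train'
    assert parse_task('-1') == '-1'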
@@ -5384,6 +5658,7 @@ def jobs_pool_apply(
image_id: Optional[str],
env_file: Optional[Dict[str, str]],
env: List[Tuple[str, str]],
+ secret_file: Optional[Dict[str, str]],
secret: List[Tuple[str, str]],
gpus: Optional[str],
instance_type: Optional[str],
@@ -5417,6 +5692,12 @@ def jobs_pool_apply(
'Cannot specify both --workers and POOL_YAML. Please use one of '
'them.')
+ if pool_yaml is not None and len(pool_yaml) > 0:
+ recipe_yaml = _get_recipe_yaml(pool_yaml[0])
+ if recipe_yaml is not None:
+ click.secho('Recipe to run: ', fg='cyan', nl=False)
+ pool_yaml = (recipe_yaml,)
+
if pool_yaml is None or len(pool_yaml) == 0:
if pool is None:
raise click.UsageError(
@@ -5444,6 +5725,7 @@ def jobs_pool_apply(
image_id=image_id,
env_file=env_file,
env=env,
+ secret_file=secret_file,
secret=secret,
disk_size=disk_size,
disk_tier=disk_tier,
@@ -5885,7 +6167,8 @@ def _generate_task_with_service(
image_id: Optional[str],
env_file: Optional[Dict[str, str]],
env: List[Tuple[str, str]],
- secret: Optional[List[Tuple[str, str]]],
+ secret_file: Optional[Dict[str, str]],
+ secret: List[Tuple[str, str]],
gpus: Optional[str],
instance_type: Optional[str],
ports: Optional[Tuple[str]],
@@ -5904,7 +6187,8 @@ def _generate_task_with_service(
yaml_name = 'SERVICE_YAML' if not pool else 'POOL_YAML'
if not is_yaml:
raise click.UsageError(f'{yaml_name} must be a valid YAML file.')
- env = _merge_env_vars(env_file, env)
+ env = _merge_cli_and_file_vars([env_file], env)
+ secret = _merge_cli_and_file_vars([env_file, secret_file], secret)
# We keep nargs=-1 in service_yaml argument to reuse this function.
task = _make_task_or_dag_from_entrypoint_with_overrides(
service_yaml_args,
@@ -6042,6 +6326,7 @@ def serve_up(
image_id: Optional[str],
env_file: Optional[Dict[str, str]],
env: List[Tuple[str, str]],
+ secret_file: Optional[Dict[str, str]],
secret: List[Tuple[str, str]],
gpus: Optional[str],
instance_type: Optional[str],
@@ -6105,6 +6390,7 @@ def serve_up(
image_id=image_id,
env_file=env_file,
env=env,
+ secret_file=secret_file,
secret=secret,
disk_size=disk_size,
disk_tier=disk_tier,
@@ -6156,12 +6442,12 @@ def serve_up(
@timeline.event
@usage_lib.entrypoint
def serve_update(
- service_name: str, service_yaml: Tuple[str,
- ...], workdir: Optional[str],
- infra: Optional[str], cloud: Optional[str], region: Optional[str],
- zone: Optional[str], num_nodes: Optional[int], use_spot: Optional[bool],
- image_id: Optional[str], env_file: Optional[Dict[str, str]],
- env: List[Tuple[str, str]], secret: List[Tuple[str, str]],
+ service_name: str, service_yaml: Tuple[str, ...],
+ workdir: Optional[str], infra: Optional[str], cloud: Optional[str],
+ region: Optional[str], zone: Optional[str], num_nodes: Optional[int],
+ use_spot: Optional[bool], image_id: Optional[str],
+ env_file: Optional[Dict[str, str]], env: List[Tuple[str, str]],
+ secret_file: Optional[Dict[str, str]], secret: List[Tuple[str, str]],
gpus: Optional[str], instance_type: Optional[str], ports: Tuple[str],
cpus: Optional[str], memory: Optional[str], disk_size: Optional[int],
disk_tier: Optional[str], network_tier: Optional[str], mode: str,
@@ -6215,6 +6501,7 @@ def serve_update(
image_id=image_id,
env_file=env_file,
env=env,
+ secret_file=secret_file,
secret=secret,
disk_size=disk_size,
disk_tier=disk_tier,
diff --git a/sky/client/cli/flags.py b/sky/client/cli/flags.py
index 531c47281d6..d2012af9cb5 100644
--- a/sky/client/cli/flags.py
+++ b/sky/client/cli/flags.py
@@ -52,6 +52,21 @@ def _parse_secret_var(secret_var: str) -> Tuple[str, str]:
help=('Run the command asynchronously.'))
]
+GRACEFUL_OPTIONS = [
+ click.option(
+ '--graceful',
+ is_flag=True,
+ default=False,
+        help=('Wait for MOUNT_CACHED uploads to complete before '
+              'stopping/terminating. Running jobs are cancelled first.')),
+ click.option('--graceful-timeout',
+ type=int,
+ default=None,
+                 help=('Timeout in seconds for the `--graceful` flag. When '
+                       'not set, waits indefinitely for MOUNT_CACHED '
+                       'uploads to finish.')),
+]
+
TASK_OPTIONS = [
click.option(
'--workdir',
@@ -155,7 +170,11 @@ def _parse_secret_var(secret_var: str) -> Tuple[str, str]:
node.
If any values from ``--env-file`` conflict with values set by
- ``--env``, the ``--env`` value will be preferred."""),
+ ``--env``, the ``--env`` value will be preferred.
+
+    Values from ``--env-file`` are also loaded as secrets, with lower
+    precedence than ``--secret`` or ``--secret-file``.
+ """),
click.option(
'--env',
required=False,
@@ -176,6 +195,16 @@ def _parse_secret_var(secret_var: str) -> Tuple[str, str]:
3. ``--env MY_ENV3``: set ``$MY_ENV3`` on the cluster to be the
same value of ``$MY_ENV3`` in the local environment.""",
),
+ click.option(
+ '--secret-file',
+ required=False,
+ type=dotenv.dotenv_values,
+ help="""\
+ Path to a dotenv file with secret variables to set on the remote node.
+
+ If any values from ``--secret-file`` conflict with values set by
+ ``--secret``, the ``--secret`` value will be preferred.""",
+ ),
click.option(
'--secret',
required=False,
diff --git a/sky/client/cli/table_utils.py b/sky/client/cli/table_utils.py
index dd9fa4876ee..fe07d7e907b 100644
--- a/sky/client/cli/table_utils.py
+++ b/sky/client/cli/table_utils.py
@@ -204,15 +204,23 @@ def format(self) -> str:
class PVCVolumeTable(VolumeTable):
"""The PVC volume table."""
+ def __init__(self,
+ volumes: List[responses.VolumeRecord],
+ show_all: bool = False):
+ # Check if any volume has an error before creating the table
+ self._has_errors = any(row.get('error_message') for row in volumes)
+ super().__init__(volumes, show_all)
+
def _create_table(self, show_all: bool = False) -> prettytable.PrettyTable:
"""Create the PVC volume table."""
# If show_all is False, show the table with the columns:
# NAME, TYPE, INFRA, SIZE, USER, WORKSPACE,
# AGE, STATUS, LAST_USE, USED_BY, IS_EPHEMERAL
+ # (+ MESSAGE if any volume is not ready)
# If show_all is True, show the table with the columns:
# NAME, TYPE, INFRA, SIZE, USER, WORKSPACE,
# AGE, STATUS, LAST_USE, USED_BY, IS_EPHEMERAL, NAME_ON_CLOUD
- # STORAGE_CLASS, ACCESS_MODE
+ # STORAGE_CLASS, ACCESS_MODE, MESSAGE
columns = _BASIC_COLUMNS + [
'IS_EPHEMERAL',
@@ -222,7 +230,11 @@ def _create_table(self, show_all: bool = False) -> prettytable.PrettyTable:
'NAME_ON_CLOUD',
'STORAGE_CLASS',
'ACCESS_MODE',
+ 'MESSAGE',
]
+ elif self._has_errors:
+ # Show MESSAGE column even without show_all if there are issues
+ columns = columns + ['MESSAGE']
table = log_utils.create_table(columns)
return table
@@ -239,6 +251,17 @@ def _add_rows(self,
table_row.append(
row.get('config', {}).get('storage_class_name', '-'))
table_row.append(row.get('config', {}).get('access_mode', ''))
+ # Add error message
+ error_msg = row.get('error_message', '')
+ table_row.append(error_msg if error_msg else '-')
+ elif self._has_errors:
+ # Show error message even without show_all if there are errors
+ error_msg = row.get('error_message', '')
+ # Truncate error message for display
+ if error_msg:
+ error_msg = common_utils.truncate_long_string(
+ error_msg, constants.ERROR_MESSAGE_TRUNC_LENGTH)
+ table_row.append(error_msg if error_msg else '-')
self.table.add_row(table_row)
diff --git a/sky/client/oauth.py b/sky/client/oauth.py
index 3afc1f2366e..da1d3006935 100644
--- a/sky/client/oauth.py
+++ b/sky/client/oauth.py
@@ -5,7 +5,7 @@
import time
from typing import Dict, Optional
-AUTH_TIMEOUT = 300 # 5 minutes
+from sky.server import constants as server_constants
class _AuthCallbackHandler(BaseHTTPRequestHandler):
@@ -44,10 +44,12 @@ def log_message(self, *args): # pylint: disable=unused-argument
pass
-def start_local_auth_server(port: int,
- token_store: Dict[str, Optional[str]],
- remote_endpoint: str,
- timeout: int = AUTH_TIMEOUT) -> HTTPServer:
+def start_local_auth_server(
+ port: int,
+ token_store: Dict[str, Optional[str]],
+ remote_endpoint: str,
+ timeout: int = server_constants.AUTH_SESSION_TIMEOUT_SECONDS
+) -> HTTPServer:
"""Start a local HTTP server to handle OAuth callback.
Args:
diff --git a/sky/client/sdk.py b/sky/client/sdk.py
index fe858966930..007cf847a14 100644
--- a/sky/client/sdk.py
+++ b/sky/client/sdk.py
@@ -36,6 +36,7 @@
from sky.jobs import utils as managed_job_utils
from sky.schemas.api import responses
from sky.server import common as server_common
+from sky.server import constants as server_constants
from sky.server import rest
from sky.server import versions
from sky.server.requests import payloads
@@ -65,6 +66,7 @@
import binascii
import io
import pathlib
+ import secrets
import time
import webbrowser
@@ -82,6 +84,8 @@
base64 = adaptors_common.LazyImport('base64')
binascii = adaptors_common.LazyImport('binascii')
pathlib = adaptors_common.LazyImport('pathlib')
+ requests = adaptors_common.LazyImport('requests')
+ secrets = adaptors_common.LazyImport('secrets')
time = adaptors_common.LazyImport('time')
# only used in dashboard() and api_login()
webbrowser = adaptors_common.LazyImport('webbrowser')
@@ -374,7 +378,7 @@ def optimize(
for a task.
exceptions.NoCloudAccessError: if no public clouds are enabled.
"""
- dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
+ dag_str = dag_utils.dump_dag_to_yaml_str(dag)
body = payloads.OptimizeBody(dag=dag_str,
minimize=minimize,
@@ -434,7 +438,7 @@ def validate(
task.expand_and_validate_workdir()
if not workdir_only:
task.expand_and_validate_file_mounts()
- dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
+ dag_str = dag_utils.dump_dag_to_yaml_str(dag)
body = payloads.ValidateBody(dag=dag_str,
request_options=admin_policy_request_options)
response = server_common.make_authenticated_request(
@@ -732,7 +736,7 @@ def _launch(
dag = client_common.upload_mounts_to_api_server(dag)
- dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
+ dag_str = dag_utils.dump_dag_to_yaml_str(dag)
body = payloads.LaunchBody(
task=dag_str,
@@ -823,7 +827,7 @@ def exec( # pylint: disable=redefined-builtin
dag = dag_utils.convert_entrypoint_to_dag(task)
validate(dag, workdir_only=True)
dag = client_common.upload_mounts_to_api_server(dag, workdir_only=True)
- dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
+ dag_str = dag_utils.dump_dag_to_yaml_str(dag)
body = payloads.ExecBody(
task=dag_str,
cluster_name=cluster_name,
@@ -1017,6 +1021,44 @@ def tail_provision_logs(cluster_name: str,
return 0
+@usage_lib.entrypoint
+@server_common.check_server_healthy_or_start
+@annotations.client_api
+def tail_autostop_logs(cluster_name: str,
+ follow: bool = True,
+ tail: int = 0) -> int:
+ """Tails the autostop hook logs (autostop_hook.log) for a cluster.
+
+ Args:
+ cluster_name: name of the cluster.
+ follow: whether to follow the logs.
+ tail: number of lines to display from the end of the log file.
+
+ Returns:
+ Exit code 0 on streaming success; non-zero on failure.
+
+ Request Raises:
+ ValueError: if arguments are invalid or the cluster is not supported.
+ sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
+ sky.exceptions.ClusterNotUpError: if the cluster is not UP.
+ sky.exceptions.NotSupportedError: if the cluster is not based on
+ CloudVmRayBackend.
+ sky.exceptions.ClusterOwnerIdentityMismatchError: if the current user is
+ not the same as the user who created the cluster.
+ sky.exceptions.CloudUserIdentityError: if we fail to get the current
+ user identity.
+ """
+ body = payloads.AutostopLogsBody(cluster_name=cluster_name,
+ follow=follow,
+ tail=tail)
+
+ response = server_common.make_authenticated_request(
+ 'POST', '/autostop_logs', json=json.loads(body.model_dump_json()))
+ request_id: server_common.RequestId[int] = server_common.get_request_id(
+ response)
+ return stream_and_get(request_id)
+
+
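An illustrative call of the new client API, mirroring what `sky logs --autostop` does (cluster name is a placeholder):

    import sys
    from sky.client import sdk

    # Print the last 100 lines of autostop_hook.log and exit with the
    # streaming return code.
    sys.exit(sdk.tail_autostop_logs(cluster_name='my-cluster',
                                    follow=False,
                                    tail=100))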
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@annotations.client_api
@@ -1153,8 +1195,12 @@ def start(
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@annotations.client_api
-def down(cluster_name: str,
- purge: bool = False) -> server_common.RequestId[None]:
+def down(
+ cluster_name: str,
+ purge: bool = False,
+ graceful: bool = False,
+ graceful_timeout: Optional[int] = None,
+) -> server_common.RequestId[None]:
"""Tears down a cluster.
Tearing down a cluster will delete all associated resources (all billing
@@ -1169,6 +1215,10 @@ def down(cluster_name: str,
troubleshooting scenarios; with it set, it is the user's
responsibility to ensure there are no leaked instances and related
resources.
+        graceful: Cancel the user's task but block until MOUNT_CACHED data
+            is fully uploaded. This helps preserve user data integrity.
+ graceful_timeout: If not None, sets a timeout for the graceful option
+ above (in seconds).
Returns:
The request ID of the down request.
@@ -1184,9 +1234,15 @@ def down(cluster_name: str,
jobs controller.
"""
+ version = versions.get_remote_api_version()
+ if graceful and version is not None and version < 32:
+ logger.warning('`--graceful` is ignored because the server does '
+ 'not support it yet.')
body = payloads.StopOrDownBody(
cluster_name=cluster_name,
purge=purge,
+ graceful=graceful,
+ graceful_timeout=graceful_timeout,
)
response = server_common.make_authenticated_request(
'POST', '/down', json=json.loads(body.model_dump_json()), timeout=5)
@@ -1196,8 +1252,12 @@ def down(cluster_name: str,
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@annotations.client_api
-def stop(cluster_name: str,
- purge: bool = False) -> server_common.RequestId[None]:
+def stop(
+ cluster_name: str,
+ purge: bool = False,
+ graceful: bool = False,
+ graceful_timeout: Optional[int] = None,
+) -> server_common.RequestId[None]:
"""Stops a cluster.
Data on attached disks is not lost when a cluster is stopped. Billing for
@@ -1230,9 +1290,15 @@ def stop(cluster_name: str,
cluster, or a TPU VM Pod cluster, or the managed jobs controller.
"""
+ version = versions.get_remote_api_version()
+ if graceful and version is not None and version < 32:
+ logger.warning('`--graceful` is ignored because the server does '
+ 'not support it yet.')
body = payloads.StopOrDownBody(
cluster_name=cluster_name,
purge=purge,
+ graceful=graceful,
+ graceful_timeout=graceful_timeout,
)
response = server_common.make_authenticated_request(
'POST', '/stop', json=json.loads(body.model_dump_json()), timeout=5)
@@ -1243,10 +1309,12 @@ def stop(cluster_name: str,
@server_common.check_server_healthy_or_start
@annotations.client_api
def autostop(
- cluster_name: str,
- idle_minutes: int,
- wait_for: Optional[autostop_lib.AutostopWaitFor] = None,
- down: bool = False, # pylint: disable=redefined-outer-name
+ cluster_name: str,
+ idle_minutes: int,
+ wait_for: Optional[autostop_lib.AutostopWaitFor] = None,
+ down: bool = False, # pylint: disable=redefined-outer-name
+ hook: Optional[str] = None,
+ hook_timeout: Optional[int] = None,
) -> server_common.RequestId[None]:
"""Schedules an autostop/autodown for a cluster.
@@ -1287,6 +1355,13 @@ def autostop(
3. "none" - Wait for nothing; autostop right after ``idle_minutes``.
down: if true, use autodown (tear down the cluster; non-restartable),
rather than autostop (restartable).
+ hook: optional script to execute on the remote cluster before autostop.
+ The script runs before the cluster is stopped or torn down. If the
+ hook fails, autostop will still proceed but a warning will be
+ logged.
+ hook_timeout: timeout in seconds for hook execution. If None, uses
+ DEFAULT_AUTOSTOP_HOOK_TIMEOUT_SECONDS (3600 = 1 hour). The hook will
+ be terminated if it exceeds this timeout.
Returns:
The request ID of the autostop request.
@@ -1295,6 +1370,7 @@ def autostop(
None
Request Raises:
+ ValueError: if arguments are invalid.
sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
sky.exceptions.ClusterNotUpError: if the cluster is not UP.
sky.exceptions.NotSupportedError: if the cluster is not based on
@@ -1304,17 +1380,28 @@ def autostop(
sky.exceptions.CloudUserIdentityError: if we fail to get the current
user identity.
"""
+ if hook_timeout is not None and hook is None:
+ raise ValueError('hook_timeout can only be set if hook is set.')
+
remote_api_version = versions.get_remote_api_version()
if wait_for is not None and (remote_api_version is None or
remote_api_version < 13):
logger.warning('wait_for is not supported in your API server. '
'Please upgrade to a newer API server to use it.')
+ # Hook support requires API version 28 or higher
+ if hook is not None and (remote_api_version is None or
+ remote_api_version < 28):
+ logger.warning('Autostop hook is not supported in your API server. '
+ 'Please upgrade to a newer API server to use it.')
+
body = payloads.AutostopBody(
cluster_name=cluster_name,
idle_minutes=idle_minutes,
wait_for=wait_for,
down=down,
+ hook=hook,
+ hook_timeout=hook_timeout,
)
response = server_common.make_authenticated_request(
'POST', '/autostop', json=json.loads(body.model_dump_json()), timeout=5)
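An illustrative use of the new hook parameters (the hook value and timeout are placeholders; the exact hook format is defined by the server-side payload):

    from sky.client import sdk

    # Schedule autostop after 30 idle minutes; run a user-provided hook
    # first and kill it if it takes longer than 10 minutes.
    request_id = sdk.autostop('my-cluster',
                              idle_minutes=30,
                              hook='~/flush_cache.sh',
                              hook_timeout=600)
    sdk.stream_and_get(request_id)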
@@ -2479,6 +2566,117 @@ def _check_endpoint_in_env_var(is_login: bool) -> None:
'clear the environment variable.')
+def _try_polling_auth(endpoint: str) -> Optional[str]:
+ """Try the polling-based authentication flow."""
+ try:
+ # Generate code verifier (random secret) and challenge (hash)
+ code_verifier = common_utils.base64_url_encode(secrets.token_bytes(32))
+ code_challenge = common_utils.compute_code_challenge(code_verifier)
+
+ # Open browser to authorization page
+ auth_url = f'{endpoint}/auth/authorize?code_challenge={code_challenge}'
+ if not webbrowser.open(auth_url):
+ logger.debug('Failed to open browser.')
+ return None
+
+ click.echo(f'{colorama.Fore.GREEN}Browser opened at {auth_url}'
+ f'{colorama.Style.RESET_ALL}\n'
+ f'Please click "Authorize" to complete login.\n'
+ f'{colorama.Style.DIM}Press ctrl+c to fall back to legacy '
+ f'auth method.{colorama.Style.RESET_ALL}')
+
+ # Poll for token
+ start_time = time.time()
+ while time.time(
+ ) - start_time < server_constants.AUTH_SESSION_TIMEOUT_SECONDS:
+ time.sleep(1)
+ resp = requests.get(f'{endpoint}/api/v1/auth/token',
+ params={'code_verifier': code_verifier},
+ timeout=10)
+
+ if resp.status_code == 200:
+ data = resp.json()
+ if 'token' in data:
+ return data['token']
+ elif resp.status_code != 404:
+ # 404 means user hasn't clicked Authorize yet, keep polling
+ logger.debug(f'Poll failed: {resp.status_code}')
+ return None
+
+ click.echo(f'{colorama.Fore.YELLOW}Authentication timed out.'
+ f'{colorama.Style.RESET_ALL}')
+ return None
+
+ except KeyboardInterrupt:
+ click.echo(f'\n{colorama.Style.DIM}Interrupted.'
+ f'{colorama.Style.RESET_ALL}')
+ return None
+ except Exception as e: # pylint: disable=broad-except
+ logger.debug(f'Polling auth failed: {e}')
+ return None
+
+
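The verifier/challenge pair above follows the PKCE pattern: the client keeps a random secret and sends only its hash, then proves possession when polling for the token. A likely-equivalent sketch of the two common_utils helpers, assuming the standard S256 transform (SHA-256 plus unpadded base64url):

    import base64
    import hashlib
    import secrets

    def base64_url_encode(data: bytes) -> str:
        return base64.urlsafe_b64encode(data).rstrip(b'=').decode()

    def compute_code_challenge(code_verifier: str) -> str:
        return base64_url_encode(
            hashlib.sha256(code_verifier.encode()).digest())

    code_verifier = base64_url_encode(secrets.token_bytes(32))
    code_challenge = compute_code_challenge(code_verifier)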
+def _try_localhost_callback_auth(endpoint: str) -> Optional[str]:
+ """Try the localhost callback authentication flow (legacy)."""
+ server: Optional[oauth_lib.HTTPServer] = None
+ try:
+ callback_port = common_utils.find_free_port(8000)
+ token_container: Dict[str, Optional[str]] = {'token': None}
+ server = oauth_lib.start_local_auth_server(callback_port,
+ token_container, endpoint)
+
+ token_url = f'{endpoint}/token?local_port={callback_port}'
+ if not webbrowser.open(token_url):
+ return None
+
+ click.echo(f'{colorama.Fore.GREEN}Browser opened at {token_url}'
+ f'{colorama.Style.RESET_ALL}\n'
+ f'{colorama.Style.DIM}Press ctrl+c to enter token manually.'
+ f'{colorama.Style.RESET_ALL}')
+
+ start_time = time.time()
+ while (token_container['token'] is None and time.time() - start_time <
+ server_constants.AUTH_SESSION_TIMEOUT_SECONDS):
+ time.sleep(1)
+
+ if token_container['token'] is None:
+ click.echo(f'{colorama.Fore.YELLOW}Authentication timed out.'
+ f'{colorama.Style.RESET_ALL}')
+ return None
+ return token_container['token']
+
+ except KeyboardInterrupt:
+ click.echo(f'\n{colorama.Style.DIM}Interrupted.'
+ f'{colorama.Style.RESET_ALL}')
+ return None
+ except Exception as e: # pylint: disable=broad-except
+ logger.debug(f'Localhost callback failed: {e}')
+ return None
+ finally:
+ if server is not None:
+ try:
+ server.server_close()
+ except Exception: # pylint: disable=broad-except
+ pass
+
+
+def _try_manual_token_entry(endpoint: str) -> Optional[str]:
+ """Fall back to manual token entry."""
+ try:
+ token_url = f'{endpoint}/token'
+ click.echo(
+ f'Visit this URL to get the token:\n\n'
+ f'{colorama.Style.BRIGHT}{token_url}{colorama.Style.RESET_ALL}\n')
+ return click.prompt('Paste the token') or None
+ except (KeyboardInterrupt, click.Abort):
+ click.echo(
+ f'\n{colorama.Style.DIM}Cancelled.{colorama.Style.RESET_ALL}')
+ return None
+ except Exception as e: # pylint: disable=broad-except
+ logger.debug(f'Manual token entry failed: {e}')
+ return None
+
+
@usage_lib.entrypoint
@annotations.client_api
def api_login(endpoint: Optional[str] = None,
@@ -2581,59 +2779,26 @@ def _set_user_hash(user_hash: Optional[str]) -> None:
if server_status == server_common.ApiServerStatus.NEEDS_AUTH or relogin:
# We detected an auth proxy, so go through the auth proxy cookie flow.
token: Optional[str] = None
- server: Optional[oauth_lib.HTTPServer] = None
- try:
- callback_port = common_utils.find_free_port(8000)
-
- token_container: Dict[str, Optional[str]] = {'token': None}
- logger.debug('Starting local authentication server...')
- server = oauth_lib.start_local_auth_server(callback_port,
- token_container,
- endpoint)
-
- token_url = (f'{endpoint}/token?local_port={callback_port}')
- if webbrowser.open(token_url):
- click.echo(f'{colorama.Fore.GREEN}A web browser has been '
- f'opened at {token_url}. Please continue the login '
- f'in the web browser.{colorama.Style.RESET_ALL}\n'
- f'{colorama.Style.DIM}To manually copy the token, '
- f'press ctrl+c.{colorama.Style.RESET_ALL}')
- else:
- raise ValueError('Failed to open browser.')
- start_time = time.time()
+ # Try methods in order:
+ # 1. New polling-based flow - only on servers >= API v30
+ # 2. Old localhost callback flow
+ # 3. Manual token entry
+ remote_api_version = versions.get_remote_api_version()
+ if remote_api_version is not None and remote_api_version >= 30:
+ token = _try_polling_auth(endpoint)
- while (token_container['token'] is None and
- time.time() - start_time < oauth_lib.AUTH_TIMEOUT):
- time.sleep(1)
+ if token is None:
+ # Polling auth not available or failed, try localhost callback
+ token = _try_localhost_callback_auth(endpoint)
- if token_container['token'] is None:
- click.echo(f'{colorama.Fore.YELLOW}Authentication timed out '
- f'after {oauth_lib.AUTH_TIMEOUT} seconds.')
- else:
- token = token_container['token']
-
- except (Exception, KeyboardInterrupt) as e: # pylint: disable=broad-except
- logger.debug(f'Automatic authentication failed: {e}, '
- 'falling back to manual token entry.')
- if isinstance(e, KeyboardInterrupt):
- click.echo(f'\n{colorama.Style.DIM}Interrupted. Press ctrl+c '
- f'again to exit.{colorama.Style.RESET_ALL}')
- # Fall back to manual token entry
- token_url = f'{endpoint}/token'
- click.echo('Authentication is needed. Please visit this URL '
- f'to set up the token:{colorama.Style.BRIGHT}\n\n'
- f'{token_url}\n{colorama.Style.RESET_ALL}')
- token = click.prompt('Paste the token')
- finally:
- if server is not None:
- try:
- server.server_close()
- except Exception: # pylint: disable=broad-except
- pass
- if not token:
- with ux_utils.print_exception_no_traceback():
- raise ValueError('Authentication failed.')
+ if token is None:
+ # All automatic methods failed, fall back to manual entry
+ token = _try_manual_token_entry(endpoint)
+
+ if not token:
+ with ux_utils.print_exception_no_traceback():
+ raise ValueError('Authentication failed.')
# Parse the token.
# b64decode will ignore invalid characters, but does some length and
@@ -2642,7 +2807,6 @@ def _set_user_hash(user_hash: Optional[str]) -> None:
data = base64.b64decode(token)
except binascii.Error as e:
raise ValueError(f'Malformed token: {token}') from e
- logger.debug(f'Token data: {data!r}')
try:
json_data = json.loads(data)
except (json.JSONDecodeError, UnicodeDecodeError) as e:
diff --git a/sky/client/sdk_async.py b/sky/client/sdk_async.py
index abac1db20ff..bf962e68339 100644
--- a/sky/client/sdk_async.py
+++ b/sky/client/sdk_async.py
@@ -10,6 +10,7 @@
statuses = await sky.get(request_id)
"""
+import asyncio
import dataclasses
import logging
import typing
@@ -33,7 +34,6 @@
from sky.usage import usage_lib
from sky.utils import annotations
from sky.utils import common
-from sky.utils import context_utils
from sky.utils import env_options
from sky.utils import rich_utils
from sky.utils import ux_utils
@@ -280,8 +280,8 @@ async def check(
stream_logs: Optional[StreamConfig] = DEFAULT_STREAM_CONFIG
) -> Dict[str, List[str]]:
"""Async version of check() that checks the credentials to enable clouds."""
- request_id = await context_utils.to_thread(sdk.check, infra_list, verbose,
- workspace)
+ request_id = await asyncio.to_thread(sdk.check, infra_list, verbose,
+ workspace)
if stream_logs is not None:
return await _stream_and_get(request_id, stream_logs)
else:
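The swap from context_utils.to_thread to asyncio.to_thread (stdlib, Python 3.9+) keeps the same shape throughout this file: the blocking SDK call runs in a worker thread so the event loop stays responsive. A minimal standalone example:

    import asyncio
    import time

    def blocking_call(x: int) -> int:
        time.sleep(0.1)  # stands in for a blocking SDK request
        return x * 2

    async def main() -> None:
        # The event loop keeps running while the call executes in a thread.
        result = await asyncio.to_thread(blocking_call, 21)
        print(result)  # 42

    asyncio.run(main())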
@@ -296,8 +296,7 @@ async def enabled_clouds(
stream_logs: Optional[StreamConfig] = DEFAULT_STREAM_CONFIG
) -> List[str]:
"""Async version of enabled_clouds() that gets the enabled clouds."""
- request_id = await context_utils.to_thread(sdk.enabled_clouds, workspace,
- expand)
+ request_id = await asyncio.to_thread(sdk.enabled_clouds, workspace, expand)
if stream_logs is not None:
return await _stream_and_get(request_id, stream_logs)
else:
@@ -319,11 +318,10 @@ async def list_accelerators(
) -> Dict[str, List[catalog.common.InstanceTypeInfo]]:
"""Async version of list_accelerators() that lists the names of all
accelerators offered by Sky."""
- request_id = await context_utils.to_thread(sdk.list_accelerators, gpus_only,
- name_filter, region_filter,
- quantity_filter, clouds,
- all_regions, require_price,
- case_sensitive)
+ request_id = await asyncio.to_thread(sdk.list_accelerators, gpus_only,
+ name_filter, region_filter,
+ quantity_filter, clouds, all_regions,
+ require_price, case_sensitive)
if stream_logs is not None:
return await _stream_and_get(request_id, stream_logs)
else:
@@ -342,10 +340,9 @@ async def list_accelerator_counts(
) -> Dict[str, List[int]]:
"""Async version of list_accelerator_counts() that lists all accelerators
offered by Sky and available counts."""
- request_id = await context_utils.to_thread(sdk.list_accelerator_counts,
- gpus_only, name_filter,
- region_filter, quantity_filter,
- clouds)
+ request_id = await asyncio.to_thread(sdk.list_accelerator_counts, gpus_only,
+ name_filter, region_filter,
+ quantity_filter, clouds)
if stream_logs is not None:
return await _stream_and_get(request_id, stream_logs)
else:
@@ -363,8 +360,8 @@ async def optimize(
) -> 'sky.Dag':
"""Async version of optimize() that finds the best execution plan for the
given DAG."""
- request_id = await context_utils.to_thread(sdk.optimize, dag, minimize,
- admin_policy_request_options)
+ request_id = await asyncio.to_thread(sdk.optimize, dag, minimize,
+ admin_policy_request_options)
if stream_logs is not None:
return await _stream_and_get(request_id, stream_logs)
else:
@@ -377,7 +374,7 @@ async def workspaces(
stream_logs: Optional[StreamConfig] = DEFAULT_STREAM_CONFIG
) -> Dict[str, Any]:
"""Async version of workspaces() that gets the workspaces."""
- request_id = await context_utils.to_thread(sdk.workspaces)
+ request_id = await asyncio.to_thread(sdk.workspaces)
if stream_logs is not None:
return await _stream_and_get(request_id, stream_logs)
else:
@@ -408,7 +405,7 @@ async def launch(
stream_logs: Optional[StreamConfig] = DEFAULT_STREAM_CONFIG,
) -> Tuple[Optional[int], Optional['backends.ResourceHandle']]:
"""Async version of launch() that launches a cluster or task."""
- request_id = await context_utils.to_thread(
+ request_id = await asyncio.to_thread(
sdk.launch, task, cluster_name, retry_until_up,
idle_minutes_to_autostop, wait_for, dryrun, down, backend,
optimize_target, no_setup, clone_disk_from, fast, _need_confirmation,
@@ -431,8 +428,8 @@ async def exec( # pylint: disable=redefined-builtin
stream_logs: Optional[StreamConfig] = DEFAULT_STREAM_CONFIG,
) -> Tuple[Optional[int], Optional['backends.ResourceHandle']]:
"""Async version of exec() that executes a task on an existing cluster."""
- request_id = await context_utils.to_thread(sdk.exec, task, cluster_name,
- dryrun, down, backend)
+ request_id = await asyncio.to_thread(sdk.exec, task, cluster_name, dryrun,
+ down, backend)
if stream_logs is not None:
return await _stream_and_get(request_id, stream_logs)
else:
@@ -447,8 +444,8 @@ async def tail_logs(cluster_name: str,
tail: int = 0,
output_stream: Optional['io.TextIOBase'] = None) -> int:
"""Async version of tail_logs() that tails the logs of a job."""
- return await context_utils.to_thread(sdk.tail_logs, cluster_name, job_id,
- follow, tail, output_stream)
+ return await asyncio.to_thread(sdk.tail_logs, cluster_name, job_id, follow,
+ tail, output_stream)
@usage_lib.entrypoint
@@ -456,8 +453,7 @@ async def tail_logs(cluster_name: str,
async def download_logs(cluster_name: str,
job_ids: Optional[List[str]]) -> Dict[str, str]:
"""Async version of download_logs() that downloads the logs of jobs."""
- return await context_utils.to_thread(sdk.download_logs, cluster_name,
- job_ids)
+ return await asyncio.to_thread(sdk.download_logs, cluster_name, job_ids)
@usage_lib.entrypoint
@@ -472,10 +468,9 @@ async def start(
stream_logs: Optional[StreamConfig] = DEFAULT_STREAM_CONFIG,
) -> 'backends.CloudVmRayResourceHandle':
"""Async version of start() that restarts a cluster."""
- request_id = await context_utils.to_thread(sdk.start, cluster_name,
- idle_minutes_to_autostop,
- wait_for, retry_until_up, down,
- force)
+ request_id = await asyncio.to_thread(sdk.start, cluster_name,
+ idle_minutes_to_autostop, wait_for,
+ retry_until_up, down, force)
if stream_logs is not None:
return await _stream_and_get(request_id, stream_logs)
else:
@@ -487,9 +482,12 @@ async def start(
async def down(
cluster_name: str,
purge: bool = False,
+ graceful: bool = False,
+ graceful_timeout: Optional[int] = None,
stream_logs: Optional[StreamConfig] = DEFAULT_STREAM_CONFIG) -> None:
"""Async version of down() that tears down a cluster."""
- request_id = await context_utils.to_thread(sdk.down, cluster_name, purge)
+ request_id = await asyncio.to_thread(sdk.down, cluster_name, purge,
+ graceful, graceful_timeout)
if stream_logs is not None:
return await _stream_and_get(request_id, stream_logs)
else:
@@ -501,9 +499,12 @@ async def down(
async def stop(
cluster_name: str,
purge: bool = False,
+ graceful: bool = False,
+ graceful_timeout: Optional[int] = None,
stream_logs: Optional[StreamConfig] = DEFAULT_STREAM_CONFIG) -> None:
"""Async version of stop() that stops a cluster."""
- request_id = await context_utils.to_thread(sdk.stop, cluster_name, purge)
+ request_id = await asyncio.to_thread(sdk.stop, cluster_name, purge,
+ graceful, graceful_timeout)
if stream_logs is not None:
return await _stream_and_get(request_id, stream_logs)
else:
@@ -521,8 +522,8 @@ async def autostop(
) -> None:
"""Async version of autostop() that schedules an autostop/autodown for a
cluster."""
- request_id = await context_utils.to_thread(sdk.autostop, cluster_name,
- idle_minutes, wait_for, down)
+ request_id = await asyncio.to_thread(sdk.autostop, cluster_name,
+ idle_minutes, wait_for, down)
if stream_logs is not None:
return await _stream_and_get(request_id, stream_logs)
else:
@@ -538,8 +539,8 @@ async def queue(
stream_logs: Optional[StreamConfig] = DEFAULT_STREAM_CONFIG
) -> List[responses.ClusterJobRecord]:
"""Async version of queue() that gets the job queue of a cluster."""
- request_id = await context_utils.to_thread(sdk.queue, cluster_name,
- skip_finished, all_users)
+ request_id = await asyncio.to_thread(sdk.queue, cluster_name, skip_finished,
+ all_users)
if stream_logs is not None:
return await _stream_and_get(request_id, stream_logs)
else:
@@ -555,8 +556,7 @@ async def job_status(
) -> Dict[Optional[int], Optional['job_lib.JobStatus']]:
"""Async version of job_status() that gets the status of jobs on a
cluster."""
- request_id = await context_utils.to_thread(sdk.job_status, cluster_name,
- job_ids)
+ request_id = await asyncio.to_thread(sdk.job_status, cluster_name, job_ids)
if stream_logs is not None:
return await _stream_and_get(request_id, stream_logs)
else:
@@ -574,9 +574,9 @@ async def cancel(
_try_cancel_if_cluster_is_init: bool = False,
stream_logs: Optional[StreamConfig] = DEFAULT_STREAM_CONFIG) -> None:
"""Async version of cancel() that cancels jobs on a cluster."""
- request_id = await context_utils.to_thread(sdk.cancel, cluster_name, all,
- all_users, job_ids,
- _try_cancel_if_cluster_is_init)
+ request_id = await asyncio.to_thread(sdk.cancel, cluster_name, all,
+ all_users, job_ids,
+ _try_cancel_if_cluster_is_init)
if stream_logs is not None:
return await _stream_and_get(request_id, stream_logs)
else:
@@ -594,7 +594,7 @@ async def status(
_include_credentials: bool = False,
) -> List[Dict[str, Any]]:
"""Async version of status() that gets cluster statuses."""
- request_id = await context_utils.to_thread(
+ request_id = await asyncio.to_thread(
sdk.status,
cluster_names,
refresh,
@@ -615,7 +615,7 @@ async def endpoints(
) -> Dict[int, str]:
"""Async version of endpoints() that gets the endpoint for a given cluster
and port number."""
- request_id = await context_utils.to_thread(sdk.endpoints, cluster, port)
+ request_id = await asyncio.to_thread(sdk.endpoints, cluster, port)
if stream_logs is not None:
return await _stream_and_get(request_id, stream_logs)
else:
@@ -628,7 +628,7 @@ async def cost_report(
stream_logs: Optional[StreamConfig] = DEFAULT_STREAM_CONFIG
) -> List[Dict[str, Any]]:
"""Async version of cost_report() that gets all cluster cost reports."""
- request_id = await context_utils.to_thread(sdk.cost_report)
+ request_id = await asyncio.to_thread(sdk.cost_report)
if stream_logs is not None:
return await _stream_and_get(request_id, stream_logs)
else:
@@ -641,7 +641,7 @@ async def storage_ls(
stream_logs: Optional[StreamConfig] = DEFAULT_STREAM_CONFIG
) -> List[Dict[str, Any]]:
"""Async version of storage_ls() that gets the storages."""
- request_id = await context_utils.to_thread(sdk.storage_ls)
+ request_id = await asyncio.to_thread(sdk.storage_ls)
if stream_logs is not None:
return await _stream_and_get(request_id, stream_logs)
else:
@@ -654,7 +654,7 @@ async def storage_delete(
name: str,
stream_logs: Optional[StreamConfig] = DEFAULT_STREAM_CONFIG) -> None:
"""Async version of storage_delete() that deletes a storage."""
- request_id = await context_utils.to_thread(sdk.storage_delete, name)
+ request_id = await asyncio.to_thread(sdk.storage_delete, name)
if stream_logs is not None:
return await _stream_and_get(request_id, stream_logs)
else:
@@ -670,8 +670,7 @@ async def local_up(
stream_logs: Optional[StreamConfig] = DEFAULT_STREAM_CONFIG) -> None:
"""Async version of local_up() that launches a Kubernetes cluster on
local machines."""
- request_id = await context_utils.to_thread(sdk.local_up, gpus, name,
- port_start)
+ request_id = await asyncio.to_thread(sdk.local_up, gpus, name, port_start)
if stream_logs is not None:
return await _stream_and_get(request_id, stream_logs)
else:
@@ -685,7 +684,7 @@ async def local_down(
stream_logs: Optional[StreamConfig] = DEFAULT_STREAM_CONFIG) -> None:
"""Async version of local_down() that tears down the Kubernetes cluster
started by local_up."""
- request_id = await context_utils.to_thread(sdk.local_down, name)
+ request_id = await asyncio.to_thread(sdk.local_down, name)
if stream_logs is not None:
return await _stream_and_get(request_id, stream_logs)
else:
@@ -699,7 +698,7 @@ async def ssh_up(
stream_logs: Optional[StreamConfig] = DEFAULT_STREAM_CONFIG) -> None:
"""Async version of ssh_up() that deploys the SSH Node Pools defined in
~/.sky/ssh_targets.yaml."""
- request_id = await context_utils.to_thread(sdk.ssh_up, infra)
+ request_id = await asyncio.to_thread(sdk.ssh_up, infra)
if stream_logs is not None:
return await _stream_and_get(request_id, stream_logs)
else:
@@ -713,7 +712,7 @@ async def ssh_down(
stream_logs: Optional[StreamConfig] = DEFAULT_STREAM_CONFIG) -> None:
"""Async version of ssh_down() that tears down a Kubernetes cluster on SSH
targets."""
- request_id = await context_utils.to_thread(sdk.ssh_down, infra)
+ request_id = await asyncio.to_thread(sdk.ssh_down, infra)
if stream_logs is not None:
return await _stream_and_get(request_id, stream_logs)
else:
@@ -731,7 +730,7 @@ async def realtime_kubernetes_gpu_availability(
) -> List[Tuple[str, List['models.RealtimeGpuAvailability']]]:
"""Async version of realtime_kubernetes_gpu_availability() that gets the
real-time Kubernetes GPU availability."""
- request_id = await context_utils.to_thread(
+ request_id = await asyncio.to_thread(
sdk.realtime_kubernetes_gpu_availability, context, name_filter,
quantity_filter, is_ssh)
if stream_logs is not None:
@@ -748,8 +747,7 @@ async def kubernetes_node_info(
) -> 'models.KubernetesNodesInfo':
"""Async version of kubernetes_node_info() that gets the resource
information for all the nodes in the cluster."""
- request_id = await context_utils.to_thread(sdk.kubernetes_node_info,
- context)
+ request_id = await asyncio.to_thread(sdk.kubernetes_node_info, context)
if stream_logs is not None:
return await _stream_and_get(request_id, stream_logs)
else:
@@ -765,7 +763,7 @@ async def status_kubernetes(
List[Dict[str, Any]], Optional[str]]:
"""Async version of status_kubernetes() that gets all SkyPilot clusters
and jobs in the Kubernetes cluster."""
- request_id = await context_utils.to_thread(sdk.status_kubernetes)
+ request_id = await asyncio.to_thread(sdk.status_kubernetes)
if stream_logs is not None:
return await _stream_and_get(request_id, stream_logs)
else:
@@ -781,8 +779,8 @@ async def api_cancel(
stream_logs: Optional[StreamConfig] = DEFAULT_STREAM_CONFIG
) -> List[str]:
"""Async version of api_cancel() that aborts a request or all requests."""
- request_id = await context_utils.to_thread(sdk.api_cancel, request_ids,
- all_users, silent)
+ request_id = await asyncio.to_thread(sdk.api_cancel, request_ids, all_users,
+ silent)
if stream_logs is not None:
return await _stream_and_get(request_id, stream_logs)
else:
@@ -794,15 +792,14 @@ async def api_cancel(
async def api_status(request_ids: Optional[List[str]] = None,
all_status: bool = False) -> List[payloads.RequestPayload]:
"""Async version of api_status() that lists all requests."""
- return await context_utils.to_thread(sdk.api_status, request_ids,
- all_status)
+ return await asyncio.to_thread(sdk.api_status, request_ids, all_status)
@usage_lib.entrypoint
@annotations.client_api
async def dashboard(starting_page: Optional[str] = None) -> None:
"""Async version of dashboard() that starts the dashboard for SkyPilot."""
- return await context_utils.to_thread(sdk.dashboard, starting_page)
+ return await asyncio.to_thread(sdk.dashboard, starting_page)
@usage_lib.entrypoint
@@ -810,14 +807,14 @@ async def dashboard(starting_page: Optional[str] = None) -> None:
async def api_info() -> responses.APIHealthResponse:
"""Async version of api_info() that gets the server's status, commit and
version."""
- return await context_utils.to_thread(sdk.api_info)
+ return await asyncio.to_thread(sdk.api_info)
@usage_lib.entrypoint
@annotations.client_api
async def api_stop() -> None:
"""Async version of api_stop() that stops the API server."""
- return await context_utils.to_thread(sdk.api_stop)
+ return await asyncio.to_thread(sdk.api_stop)
@usage_lib.entrypoint
@@ -825,7 +822,7 @@ async def api_stop() -> None:
async def api_server_logs(follow: bool = True,
tail: Optional[int] = None) -> None:
"""Async version of api_server_logs() that streams the API server logs."""
- return await context_utils.to_thread(sdk.api_server_logs, follow, tail)
+ return await asyncio.to_thread(sdk.api_server_logs, follow, tail)
@usage_lib.entrypoint
@@ -833,4 +830,4 @@ async def api_server_logs(follow: bool = True,
async def api_login(endpoint: Optional[str] = None,
get_token: bool = False) -> None:
"""Async version of api_login() that logs into a SkyPilot API server."""
- return await context_utils.to_thread(sdk.api_login, endpoint, get_token)
+ return await asyncio.to_thread(sdk.api_login, endpoint, get_token)
diff --git a/sky/clouds/__init__.py b/sky/clouds/__init__.py
index 7efdd8bae7f..a9a3229459b 100644
--- a/sky/clouds/__init__.py
+++ b/sky/clouds/__init__.py
@@ -35,6 +35,7 @@
from sky.clouds.ssh import SSH
from sky.clouds.vast import Vast
from sky.clouds.vsphere import Vsphere
+from sky.clouds.yotta import Yotta
__all__ = [
'IBM',
@@ -66,6 +67,7 @@
'Nebius',
'Hyperbolic',
'Seeweb',
+ 'Yotta',
# Utility functions
'cloud_in_iterable',
]
diff --git a/sky/clouds/aws.py b/sky/clouds/aws.py
index 4323ed8ac64..af24f1537a1 100644
--- a/sky/clouds/aws.py
+++ b/sky/clouds/aws.py
@@ -9,8 +9,8 @@
import subprocess
import time
import typing
-from typing import (Any, Callable, Dict, Iterator, List, Literal, Optional, Set,
- Tuple, TypeVar, Union)
+from typing import (Any, Callable, Dict, Iterable, Iterator, List, Literal,
+ Optional, Set, Tuple, TypeVar, Union)
import colorama
from typing_extensions import ParamSpec
@@ -85,6 +85,7 @@
'p5e.',
'p5en.',
'p6-b200.',
+ 'p6-b300.',
]
# Docker run options for EFA.
@@ -1654,3 +1655,18 @@ def is_label_valid(cls, label_key: str,
if not key_valid or not value_valid:
return False, error_msg
return True, None
+
+ @classmethod
+ def yield_cloud_specific_failover_overrides(cls,
+ region: Optional[str] = None
+ ) -> Iterable[Dict[str, Any]]:
+ vpc_names = skypilot_config.get_effective_region_config(
+ cloud='aws', region=region, keys=('vpc_names',), default_value=None)
+ if vpc_names:
+ if isinstance(vpc_names, str):
+ vpc_names = [vpc_names]
+ for vpc_name in vpc_names:
+ yield {'vpc_name': vpc_name}
+ else:
+ yield {}
+ return
diff --git a/sky/clouds/cloud.py b/sky/clouds/cloud.py
index 8fbca890789..83dd8acd64c 100644
--- a/sky/clouds/cloud.py
+++ b/sky/clouds/cloud.py
@@ -978,6 +978,21 @@ def display_name(cls) -> str:
"""Name of the cloud used in messages displayed to the user."""
return cls.canonical_name()
+ # === Misc Failovers ===
+
+ @classmethod
+ def yield_cloud_specific_failover_overrides(cls,
+ region: Optional[str] = None
+ ) -> Iterable[Dict[str, Any]]:
+        """Some clouds have configurations that require failover across
+        dimensions other than region/zone (e.g., multiple VPCs). This yields
+        override dicts for the cluster config; see AWS for an example."""
+ del region # unused
+ yield {}
+ return
+
+ # === End of Misc Failovers ===
+
def __repr__(self):
return self._REPR
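A hedged sketch of how failover machinery might consume this generator; provision_with() is a stand-in, not an actual SkyPilot function:

    # Try each override dict in turn (e.g. {'vpc_name': 'vpc-a'} on AWS,
    # or the single empty dict from the base class) until one succeeds.
    def provision_with_failover(cloud, region, provision_with):
        last_error = None
        for overrides in cloud.yield_cloud_specific_failover_overrides(
                region):
            try:
                return provision_with(overrides)
            except Exception as e:  # broad catch: sketch only
                last_error = e
        raise last_error if last_error else RuntimeError(
            'no overrides yielded')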
diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py
index 648c9b8ea4b..1b5a62a1017 100644
--- a/sky/clouds/gcp.py
+++ b/sky/clouds/gcp.py
@@ -542,6 +542,8 @@ def make_deploy_resources_variables(
'runtime_version']
resources_vars['tpu_node_name'] = r.accelerator_args.get(
'tpu_name')
+ resources_vars['gcp_queued_resource'] = r.accelerator_args.get(
+ 'gcp_queued_resource')
# TPU VMs require privileged mode for docker containers to
# access TPU devices.
resources_vars['docker_run_options'] = ['--privileged']
diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py
index 23345800557..fd40ecfc864 100644
--- a/sky/clouds/kubernetes.py
+++ b/sky/clouds/kubernetes.py
@@ -1,10 +1,11 @@
"""Kubernetes."""
import concurrent.futures
+import math
import os
import re
import subprocess
import tempfile
-from typing import Dict, Iterator, List, Optional, Set, Tuple, Union
+from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Union
import colorama
@@ -44,6 +45,8 @@
# addons/fuse-proxy/README.md for more details.
_FUSERMOUNT_SHARED_DIR = '/var/run/fusermount'
+AWS_EFA_RESOURCE_KEY = 'vpc.amazonaws.com/efa'
+
@registry.CLOUD_REGISTRY.register(aliases=['k8s'])
class Kubernetes(clouds.Cloud):
@@ -604,7 +607,8 @@ def _get_image_id(resources: 'resources_lib.Resources') -> str:
cloud='kubernetes',
region=context,
keys=('remote_identity',),
- default_value=schemas.get_default_remote_identity('kubernetes'))
+ default_value=schemas.get_default_remote_identity('kubernetes'),
+ override_configs=resources.cluster_config_overrides)
if isinstance(remote_identity, dict):
# If remote_identity is a dict, use the service account for the
@@ -620,13 +624,16 @@ def _get_image_id(resources: 'resources_lib.Resources') -> str:
lc = schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value
sa = schemas.RemoteIdentityOptions.SERVICE_ACCOUNT.value
+ no_upload = schemas.RemoteIdentityOptions.NO_UPLOAD.value
- if k8s_service_account_name == lc or k8s_service_account_name == sa:
+ if k8s_service_account_name in (lc, sa, no_upload):
# Use the default service account if remote identity is not set.
# For LOCAL_CREDENTIALS, this is for in-cluster authentication
# which needs a serviceaccount (specifically for SSH node pools
# which uses in-cluster authentication internally, and we would
# like to support exec-auth when the user is also using SSH infra)
+ # For NO_UPLOAD, we don't upload credentials but still need a
+ # service account for pod creation.
k8s_service_account_name = (
kubernetes_utils.DEFAULT_SERVICE_ACCOUNT_NAME)
@@ -637,8 +644,18 @@ def _get_image_id(resources: 'resources_lib.Resources') -> str:
if resources.use_spot:
spot_label_key, spot_label_value = kubernetes_utils.get_spot_label()
- network_type, machine_type = self._detect_network_type(
- context, resources.network_tier)
+ network_type, metadata = self._detect_network_type(
+ context, resources.network_tier, k8s_acc_label_key,
+ k8s_resource_key, acc_count)
+
+ k8s_efa_count = None
+ if network_type == KubernetesHighPerformanceNetworkType.AWS_EFA:
+ if metadata and 'efa_count' in metadata:
+ k8s_efa_count = metadata['efa_count']
+ else:
+ logger.warning(
+ f'No EFA interfaces detected on AWS nodes with '
+                        f'accelerator {k8s_acc_label_key}; skipping EFA '
+                        f'setup.')
# Check if this cluster supports high performance networking and
# configure appropriate settings for different cluster types
@@ -673,6 +690,19 @@ def _get_image_id(resources: 'resources_lib.Resources') -> str:
keys=('high_availability', 'storage_class_name'),
default_value=None))
+ # Get the config for setting pod CPU/memory limits relative to requests.
+ # This is useful for clusters that require limits to be set (e.g., for
+ # LimitRange enforcement or resource quotas).
+ # Can be: False (default, no limits), True (limits = requests),
+ # or a number (limits = requests * multiplier).
+ set_pod_resource_limits_config = (
+ skypilot_config.get_effective_workspace_region_config(
+ cloud='kubernetes',
+ region=context,
+ keys=('set_pod_resource_limits',),
+ default_value=False,
+ override_configs=resources.cluster_config_overrides))
+
k8s_kueue_local_queue_name = (
skypilot_config.get_effective_workspace_region_config(
# TODO(kyuds): Support SSH node pools as well.
@@ -732,6 +762,8 @@ def _get_image_id(resources: 'resources_lib.Resources') -> str:
'memory': str(mem),
'accelerator_count': str(acc_count),
'timeout': str(timeout),
+ 'k8s_efa_count': str(k8s_efa_count)
+ if k8s_efa_count is not None else None,
'k8s_port_mode': port_mode.value,
'k8s_acc_label_key': k8s_acc_label_key,
'k8s_acc_label_values': k8s_acc_label_values,
@@ -781,6 +813,17 @@ def _get_image_id(resources: 'resources_lib.Resources') -> str:
'k8s_network_type': network_type.value,
}
+ # Calculate CPU/memory limits if set_pod_resource_limits is configured.
+ # Convert config: False -> no limits, True -> multiplier 1.0,
+ # number -> that multiplier
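+        # Illustrative example (hypothetical values): with
+        # `set_pod_resource_limits: 1.5` and a request of 4 CPUs / 16 GB
+        # memory, the rendered limits become cpu=6.0 and memory=24.0.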
+ if set_pod_resource_limits_config is not False:
+ if set_pod_resource_limits_config is True:
+ multiplier = 1.0
+ else:
+ multiplier = float(set_pod_resource_limits_config)
+ deploy_vars['k8s_cpu_limit'] = round(cpus * multiplier, 3)
+ deploy_vars['k8s_memory_limit'] = round(mem * multiplier, 3)
+
# Add kubecontext if it is set. It may be None if SkyPilot is running
# inside a pod with in-cluster auth.
if context is not None:
@@ -797,7 +840,8 @@ def _get_image_id(resources: 'resources_lib.Resources') -> str:
rdma_enabled = (network_type ==
KubernetesHighPerformanceNetworkType.GCP_GPUDIRECT_RDMA)
deploy_vars['k8s_enable_gpudirect_rdma'] = rdma_enabled
- if rdma_enabled and machine_type.startswith('a4'):
+ if (rdma_enabled and metadata and 'instance_type' in metadata and
+ metadata['instance_type'].startswith('a4')):
deploy_vars['k8s_enable_gpudirect_rdma_a4'] = True
else:
deploy_vars['k8s_enable_gpudirect_rdma_a4'] = False
@@ -1151,22 +1195,31 @@ def expand_infras(cls) -> List[str]:
def _detect_network_type(
cls,
context: str,
- network_tier: Optional['resources_utils.NetworkTier'] = None
- ) -> Tuple[KubernetesHighPerformanceNetworkType, str]:
+ network_tier: Optional['resources_utils.NetworkTier'] = None,
+ k8s_acc_label_key: Optional[str] = None,
+ k8s_resource_key: Optional[str] = None,
+ acc_count: Optional[int] = None,
+ ) -> Tuple[KubernetesHighPerformanceNetworkType, Optional[Dict[str, Any]]]:
"""Detect the type of Kubernetes network based on node labels.
Args:
context: The Kubernetes context to check.
network_tier: The network tier requested. If None or not BEST,
returns NONE (no high-performance networking).
+ k8s_acc_label_key: The key of the Kubernetes accelerator label.
+        k8s_resource_key: The Kubernetes resource key for the requested
+            accelerator, used to read allocatable GPU counts from nodes.
+ acc_count: The number of accelerators requested.
Returns:
- A tuple of the detected network type and the instance type.
+ A tuple of (network_type, metadata).
+ - network_type: The detected high-performance network type
+ - metadata: Optional dict with cloud-specific info
+ (e.g., {'instance_type': str, 'efa_count': int})
"""
# If network_tier is None or not BEST, return NONE
if (network_tier is None or
network_tier != resources_utils.NetworkTier.BEST):
- return KubernetesHighPerformanceNetworkType.NONE, ''
+ return KubernetesHighPerformanceNetworkType.NONE, None
try:
nodes = kubernetes_utils.get_kubernetes_nodes(context=context)
@@ -1176,11 +1229,49 @@ def _detect_network_type(
for label_key, _ in node.metadata.labels.items():
if label_key.startswith('nebius.com/'):
return (KubernetesHighPerformanceNetworkType.NEBIUS,
- '')
+ None)
if label_key.startswith('ib.coreweave.cloud/'):
return (
KubernetesHighPerformanceNetworkType.COREWEAVE,
- '')
+ None)
+ if label_key.startswith('node-role.together.ai/'):
+ return (
+ KubernetesHighPerformanceNetworkType.TOGETHER,
+ None)
+ if label_key.startswith('k8s.io/cloud-provider-aws'):
+ network_type = (
+ KubernetesHighPerformanceNetworkType.AWS_EFA)
+ metadata: Optional[Dict[str, Any]] = None
+ # Only check for AWS EFA count if GPU is specified
+ if (not k8s_acc_label_key or not k8s_resource_key or
+ not acc_count):
+ return (network_type, metadata)
+ if (k8s_acc_label_key not in node.metadata.labels or
+ k8s_resource_key
+ not in node.status.allocatable or
+ int(node.status.
+ allocatable[k8s_resource_key]) <
+ acc_count):
+ continue
+ # Calculate EFA count proportionally
+ if AWS_EFA_RESOURCE_KEY in node.status.allocatable:
+ node_gpu_count = int(
+ node.status.allocatable[k8s_resource_key])
+ node_efa_count = int(
+ node.status.
+ allocatable[AWS_EFA_RESOURCE_KEY])
+ if node_efa_count > 0:
+ # Proportional allocation:
+ # user_gpu / node_gpu * node_efa
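+                            # Illustrative: requesting 4 GPUs on a
+                            # node with 8 GPUs and 4 EFA interfaces
+                            # yields floor(4 / 8 * 4) = 2, clamped
+                            # to [1, node_efa_count] below.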
+ calculated_efa = math.floor(acc_count /
+ node_gpu_count *
+ node_efa_count)
+ efa_count = max(
+ 1, min(calculated_efa, node_efa_count))
+ metadata = {'efa_count': efa_count}
+ return (network_type, metadata)
+ # No EFA available, but it's an AWS node
+ return (network_type, metadata)
# Check for GKE clusters with specific GPUDirect variants
machine_family = node.metadata.labels.get(
@@ -1196,26 +1287,36 @@ def _detect_network_type(
# variant
if 'a3-highgpu-8g' in instance_type:
return (
- KubernetesHighPerformanceNetworkType.GCP_TCPX,
- 'a3-highgpu-8g')
+ KubernetesHighPerformanceNetworkType.GCP_TCPX, {
+ 'instance_type': 'a3-highgpu-8g'
+ })
elif 'a3-edgegpu-8g' in instance_type:
return (
- KubernetesHighPerformanceNetworkType.GCP_TCPX,
- 'a3-edgegpu-8g')
+ KubernetesHighPerformanceNetworkType.GCP_TCPX, {
+ 'instance_type': 'a3-edgegpu-8g'
+ })
elif 'a3-megagpu-8g' in instance_type:
return (
KubernetesHighPerformanceNetworkType.GCP_TCPXO,
- 'a3-megagpu-8g')
+ {
+ 'instance_type': 'a3-megagpu-8g'
+ })
elif 'a4-highgpu-8g' in instance_type:
return (KubernetesHighPerformanceNetworkType.
- GCP_GPUDIRECT_RDMA, 'a4-highgpu-8g')
+ GCP_GPUDIRECT_RDMA, {
+ 'instance_type': 'a4-highgpu-8g'
+ })
elif 'a3-ultragpu-8g' in instance_type:
return (KubernetesHighPerformanceNetworkType.
- GCP_GPUDIRECT_RDMA, 'a3-ultragpu-8g')
+ GCP_GPUDIRECT_RDMA, {
+ 'instance_type': 'a3-ultragpu-8g'
+ })
# Generic A3/A4 detection as fallback
elif machine_family == 'a4':
return (KubernetesHighPerformanceNetworkType.
- GCP_GPUDIRECT_RDMA, 'a4')
+ GCP_GPUDIRECT_RDMA, {
+ 'instance_type': 'a4'
+ })
# Fallback: Check for GPU Direct TCPX capable instance
# types with high-perf GPUs
@@ -1229,8 +1330,9 @@ def _detect_network_type(
if is_gpu_direct_tcpx_instance and has_high_perf_gpu:
# Default to TCPX if we can't determine the specific
# variant
- return (KubernetesHighPerformanceNetworkType.GCP_TCPX,
- instance_type)
+ return (KubernetesHighPerformanceNetworkType.GCP_TCPX, {
+ 'instance_type': instance_type
+ })
except exceptions.KubeAPIUnreachableError:
# If we can't reach the cluster, assume no high perf networking
@@ -1246,26 +1348,31 @@ def _detect_network_type(
default_value=None)
if (autoscaler_type !=
kubernetes_enums.KubernetesAutoscalerType.GKE.value):
- return KubernetesHighPerformanceNetworkType.NONE, ''
+ return KubernetesHighPerformanceNetworkType.NONE, None
autoscaler = kubernetes_utils.get_autoscaler(
kubernetes_enums.KubernetesAutoscalerType(autoscaler_type))
logger.debug(f'{context} has autoscaler of type: {autoscaler_type}')
machine_types = autoscaler.get_available_machine_types(context)
# Check if any machine type supports high perf networking for GKE.
if 'a3-highgpu-8g' in machine_types:
- return (KubernetesHighPerformanceNetworkType.GCP_TCPX,
- 'a3-highgpu-8g')
+ return (KubernetesHighPerformanceNetworkType.GCP_TCPX, {
+ 'instance_type': 'a3-highgpu-8g'
+ })
elif 'a3-edgegpu-8g' in machine_types:
- return (KubernetesHighPerformanceNetworkType.GCP_TCPX,
- 'a3-edgegpu-8g')
+ return (KubernetesHighPerformanceNetworkType.GCP_TCPX, {
+ 'instance_type': 'a3-edgegpu-8g'
+ })
elif 'a3-megagpu-8g' in machine_types:
- return (KubernetesHighPerformanceNetworkType.GCP_TCPXO,
- 'a3-megagpu-8g')
+ return (KubernetesHighPerformanceNetworkType.GCP_TCPXO, {
+ 'instance_type': 'a3-megagpu-8g'
+ })
elif 'a4-highgpu-8g' in machine_types:
- return (KubernetesHighPerformanceNetworkType.GCP_GPUDIRECT_RDMA,
- 'a4-highgpu-8g')
+ return (KubernetesHighPerformanceNetworkType.GCP_GPUDIRECT_RDMA, {
+ 'instance_type': 'a4-highgpu-8g'
+ })
elif 'a3-ultragpu-8g' in machine_types:
- return (KubernetesHighPerformanceNetworkType.GCP_GPUDIRECT_RDMA,
- 'a3-ultragpu-8g')
+ return (KubernetesHighPerformanceNetworkType.GCP_GPUDIRECT_RDMA, {
+ 'instance_type': 'a3-ultragpu-8g'
+ })
- return KubernetesHighPerformanceNetworkType.NONE, ''
+ return KubernetesHighPerformanceNetworkType.NONE, None
diff --git a/sky/clouds/slurm.py b/sky/clouds/slurm.py
index 7af4398655d..cab850d8a08 100644
--- a/sky/clouds/slurm.py
+++ b/sky/clouds/slurm.py
@@ -47,10 +47,6 @@ class Slurm(clouds.Cloud):
'controllers is not '
'well tested with '
'Slurm.',
- clouds.CloudImplementationFeatures.IMAGE_ID: 'Specifying image ID is '
- 'not supported in Slurm.',
- clouds.CloudImplementationFeatures.DOCKER_IMAGE: 'Docker image is not '
- 'supported in Slurm.',
}
_MAX_CLUSTER_NAME_LEN_LIMIT = 120
_regions: List[clouds.Region] = []
@@ -65,7 +61,6 @@ class Slurm(clouds.Cloud):
STATUS_VERSION = clouds.StatusVersion.SKYPILOT
_SSH_CONFIG_KEY_MAPPING = {
- 'identityfile': 'IdentityFile',
'user': 'User',
'hostname': 'HostName',
}
@@ -366,6 +361,8 @@ def make_deploy_resources_variables(
if acc_type:
acc_type = slurm_utils.get_gres_gpu_type(cluster, acc_type)
+ image_id = resources.extract_docker_image()
+
deploy_vars = {
'instance_type': resources.instance_type,
'custom_resources': custom_resources,
@@ -383,11 +380,12 @@ def make_deploy_resources_variables(
'slurm_proxy_jump': ssh_config_dict.get('proxyjump', None),
# TODO(jwj): Solve naming collision with 'ssh_private_key'.
# Please refer to slurm-ray.yml.j2 'ssh' and 'auth' sections.
- 'slurm_private_key': ssh_config_dict['identityfile'][0],
+ 'slurm_private_key': slurm_utils.get_identity_file(ssh_config_dict),
'slurm_sshd_host_key_filename':
(slurm_utils.SLURM_SSHD_HOST_KEY_FILENAME),
'slurm_cluster_name_env_var':
(constants.SKY_CLUSTER_NAME_ENV_VAR_KEY),
+ 'image_id': image_id,
}
return deploy_vars
@@ -509,7 +507,7 @@ def _check_compute_credentials(
ssh_config_dict['hostname'],
int(ssh_config_dict.get('port', 22)),
ssh_config_dict['user'],
- ssh_config_dict['identityfile'][0],
+ slurm_utils.get_identity_file(ssh_config_dict),
ssh_proxy_command=ssh_config_dict.get('proxycommand', None),
ssh_proxy_jump=ssh_config_dict.get('proxyjump', None))
info = client.info()
diff --git a/sky/clouds/vast.py b/sky/clouds/vast.py
index abf4b542c5c..cbd5ca9585d 100644
--- a/sky/clouds/vast.py
+++ b/sky/clouds/vast.py
@@ -309,7 +309,7 @@ def _check_compute_credentials(
' $ pip install vastai\n'
' $ mkdir -p ~/.config/vastai\n'
f' $ echo [key] > {_CREDENTIAL_PATH}\n'
- ' For more information, see https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#vast' # pylint: disable=line-too-long
+ ' For more information, see https://docs.skypilot.co/en/latest/getting-started/installation.html#vast' # pylint: disable=line-too-long
)
return True, None
diff --git a/sky/clouds/yotta.py b/sky/clouds/yotta.py
new file mode 100644
index 00000000000..ef87ab111da
--- /dev/null
+++ b/sky/clouds/yotta.py
@@ -0,0 +1,327 @@
+""" Yotta Cloud. """
+
+import os
+import typing
+from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
+
+from sky import catalog
+from sky import clouds
+from sky.provision.yotta.yotta_utils import CREDENTIAL_FILE
+from sky.provision.yotta.yotta_utils import yotta_client
+from sky.utils import registry
+from sky.utils import resources_utils
+
+if typing.TYPE_CHECKING:
+ from sky import resources as resources_lib
+ from sky.utils import volume as volume_lib
+
+_CLOUD = 'yotta'
+_BASE_IMAGE = (
+ 'yottalabsai/pytorch:2.9.0-py3.11-cuda12.8.1-cudnn-devel-ubuntu22.04')
+
+
+@registry.CLOUD_REGISTRY.register
+class Yotta(clouds.Cloud):
+    """Yotta GPU Cloud.
+
+    _REPR: The string representation for the Yotta GPU cloud object.
+    """
+ _REPR = 'Yotta'
+ _CLOUD_UNSUPPORTED_FEATURES = {
+ clouds.CloudImplementationFeatures.STOP: 'Stopping not supported.',
+ clouds.CloudImplementationFeatures.MULTI_NODE:
+            ('Multi-node not supported yet, as the interconnection among '
+             'nodes is non-trivial on Yotta.'),
+ clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
+ ('Disk cloning not supported yet on Yotta.'),
+ clouds.CloudImplementationFeatures.SPOT_INSTANCE:
+ ('Spot instances not supported yet on Yotta.'),
+ clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
+ ('Customizing disk tier is not supported yet on Yotta.'),
+ clouds.CloudImplementationFeatures.CUSTOM_NETWORK_TIER:
+ ('Custom network tier is not supported yet on Yotta.'),
+ clouds.CloudImplementationFeatures.STORAGE_MOUNTING:
+ ('Mounting object stores is not supported on Yotta. To read data '
+ 'from object stores on Yotta, use `mode: COPY` to copy the data '
+ 'to local disk.'),
+ clouds.CloudImplementationFeatures.HOST_CONTROLLERS:
+ ('Host controllers not supported yet on Yotta.'),
+ clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+ ('High availability controllers are not supported yet on Yotta.'),
+ clouds.CloudImplementationFeatures.AUTO_TERMINATE:
+ ('Auto-termination not supported yet on Yotta.'),
+ clouds.CloudImplementationFeatures.AUTOSTOP:
+ ('Auto-stop not supported yet on Yotta.'),
+ clouds.CloudImplementationFeatures.AUTODOWN:
+ ('Auto-down not supported yet on Yotta.'),
+ clouds.CloudImplementationFeatures.CUSTOM_MULTI_NETWORK:
+ ('Customized multiple network interfaces are not supported yet on '
+ 'Yotta.'),
+ }
+
+ _MAX_CLUSTER_NAME_LEN_LIMIT = 255
+ _regions: List[clouds.Region] = []
+
+ PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
+ STATUS_VERSION = clouds.StatusVersion.SKYPILOT
+ OPEN_PORTS_VERSION = clouds.OpenPortsVersion.LAUNCH_ONLY
+
+ @classmethod
+ def _unsupported_features_for_resources(
+ cls,
+ resources: 'resources_lib.Resources',
+ region: Optional[str] = None,
+ ) -> Dict[clouds.CloudImplementationFeatures, str]:
+ """The features not supported based on the resources provided.
+
+ This method is used by check_features_are_supported() to check if the
+ cloud implementation supports all the requested features.
+
+ Returns:
+ A dict of {feature: reason} for the features not supported by the
+ cloud implementation.
+ """
+ del resources # unused
+ return cls._CLOUD_UNSUPPORTED_FEATURES
+
+ @classmethod
+ def _max_cluster_name_length(cls) -> Optional[int]:
+ return cls._MAX_CLUSTER_NAME_LEN_LIMIT
+
+ @classmethod
+ def regions_with_offering(
+ cls,
+ instance_type: str,
+ accelerators: Optional[Dict[str, int]],
+ use_spot: bool,
+ region: Optional[str],
+ zone: Optional[str],
+ resources: Optional['resources_lib.Resources'] = None,
+ ) -> List[clouds.Region]:
+ del accelerators # unused
+ regions = catalog.get_region_zones_for_instance_type(
+ instance_type, use_spot, _CLOUD)
+
+ if region is not None:
+ regions = [r for r in regions if r.name == region]
+
+ if zone is not None:
+ for r in regions:
+ assert r.zones is not None, r
+ r.set_zones([z for z in r.zones if z.name == zone])
+ regions = [r for r in regions if r.zones]
+ return regions
+
+ @classmethod
+ def get_vcpus_mem_from_instance_type(
+ cls,
+ instance_type: str,
+ ) -> Tuple[Optional[float], Optional[float]]:
+ return catalog.get_vcpus_mem_from_instance_type(instance_type,
+ clouds=_CLOUD)
+
+ @classmethod
+ def zones_provision_loop(
+ cls,
+ *,
+ region: str,
+ num_nodes: int,
+ instance_type: str,
+ accelerators: Optional[Dict[str, int]] = None,
+ use_spot: bool = False,
+ ) -> Iterator[Optional[List['clouds.Zone']]]:
+ del num_nodes # unused
+ regions = cls.regions_with_offering(instance_type,
+ accelerators,
+ use_spot,
+ region=region,
+ zone=None)
+ for r in regions:
+ assert r
+ yield r.zones
+
+ def instance_type_to_hourly_cost(self,
+ instance_type: str,
+ use_spot: bool,
+ region: Optional[str] = None,
+ zone: Optional[str] = None) -> float:
+ return catalog.get_hourly_cost(instance_type,
+ use_spot=use_spot,
+ region=region,
+ zone=zone,
+ clouds=_CLOUD)
+
+ def accelerators_to_hourly_cost(self,
+ accelerators: Dict[str, int],
+ use_spot: bool,
+ region: Optional[str] = None,
+ zone: Optional[str] = None) -> float:
+ """Returns the hourly cost of the accelerators, in dollars/hour."""
+ del accelerators, use_spot, region, zone # unused
+ return 0.0 # Yotta includes accelerators in the hourly cost.
+
+ def get_egress_cost(self, num_gigabytes: float) -> float:
+ return 0.0
+
+ @classmethod
+ def get_default_instance_type(cls,
+ cpus: Optional[str] = None,
+ memory: Optional[str] = None,
+ disk_tier: Optional[
+ resources_utils.DiskTier] = None,
+ region: Optional[str] = None,
+ zone: Optional[str] = None) -> Optional[str]:
+ """Returns the default instance type for Yotta."""
+ return catalog.get_default_instance_type(cpus=cpus,
+ memory=memory,
+ disk_tier=disk_tier,
+ region=region,
+ zone=zone,
+ clouds=_CLOUD)
+
+ @classmethod
+ def get_accelerators_from_instance_type(
+ cls, instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
+ return catalog.get_accelerators_from_instance_type(instance_type,
+ clouds=_CLOUD)
+
+ @classmethod
+ def get_zone_shell_cmd(cls) -> Optional[str]:
+ return None
+
+ def make_deploy_resources_variables(
+ self,
+ resources: 'resources_lib.Resources',
+ cluster_name: resources_utils.ClusterName,
+ region: 'clouds.Region',
+ zones: Optional[List['clouds.Zone']],
+ num_nodes: int,
+ dryrun: bool = False,
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
+ ) -> Dict[str, Any]:
+ del dryrun, cluster_name, zones, num_nodes # unused
+ resources = resources.assert_launchable()
+ acc_dict = self.get_accelerators_from_instance_type(
+ resources.instance_type)
+ custom_resources = resources_utils.make_ray_custom_resources_str(
+ acc_dict)
+
+ if resources.image_id is None:
+ image_id: Optional[str] = _BASE_IMAGE
+ elif resources.extract_docker_image() is not None:
+ image_id = resources.extract_docker_image()
+ else:
+ image_id = resources.image_id[resources.region]
+
+ instance_type = resources.instance_type
+ use_spot = resources.use_spot
+ hourly_cost = self.instance_type_to_hourly_cost(
+ instance_type=instance_type, use_spot=use_spot)
+
+ return {
+ 'instance_type': instance_type,
+ 'custom_resources': custom_resources,
+ 'region': region.name,
+ 'image_id': image_id,
+ 'use_spot': use_spot,
+ 'bid_per_gpu': str(hourly_cost),
+ 'docker_login_config': resources.docker_login_config,
+ }
+
+ def _get_feasible_launchable_resources(
+ self, resources: 'resources_lib.Resources'
+ ) -> 'resources_utils.FeasibleResources':
+ """Returns a list of feasible resources for the given resources."""
+ if resources.instance_type is not None:
+ assert resources.is_launchable(), resources
+ resources = resources.copy(accelerators=None)
+ return resources_utils.FeasibleResources([resources], [], None)
+
+ def _make(instance_list):
+ resource_list = []
+ for instance_type in instance_list:
+ r = resources.copy(
+ cloud=Yotta(),
+ instance_type=instance_type,
+ accelerators=None,
+ cpus=None,
+ )
+ resource_list.append(r)
+ return resource_list
+
+ # Currently, handle a filter on accelerators only.
+ accelerators = resources.accelerators
+ if accelerators is None:
+ # Return a default instance type
+ default_instance_type = Yotta.get_default_instance_type(
+ cpus=resources.cpus,
+ memory=resources.memory,
+ disk_tier=resources.disk_tier,
+ region=resources.region,
+ zone=resources.zone)
+ if default_instance_type is None:
+ # TODO: Add hints to all return values in this method to help
+ # users understand why the resources are not launchable.
+ return resources_utils.FeasibleResources([], [], None)
+ else:
+ return resources_utils.FeasibleResources(
+ _make([default_instance_type]), [], None)
+
+ assert len(accelerators) == 1, resources
+ acc, acc_count = list(accelerators.items())[0]
+ (instance_list,
+ fuzzy_candidate_list) = catalog.get_instance_type_for_accelerator(
+ acc,
+ acc_count,
+ use_spot=resources.use_spot,
+ cpus=resources.cpus,
+ region=resources.region,
+ zone=resources.zone,
+ clouds=_CLOUD)
+ if instance_list is None:
+ return resources_utils.FeasibleResources([], fuzzy_candidate_list,
+ None)
+ return resources_utils.FeasibleResources(_make(instance_list),
+ fuzzy_candidate_list, None)
+
+ @classmethod
+ def _check_compute_credentials(cls) -> Tuple[bool, Optional[str]]:
+ """Checks if the user has access credentials to
+ Yotta's compute service."""
+ msg = ('Failed to access Yotta Cloud with credentials. '
+ 'To configure credentials, go to:\n '
+ ' https://console.yottalabs.ai \n '
+               'to obtain an API key, then save the contents '
+ f'to {CREDENTIAL_FILE} \n')
+ if not os.path.exists(os.path.expanduser(CREDENTIAL_FILE)):
+ return False, msg
+
+ try:
+ valid = yotta_client.check_api_key()
+ if not valid:
+ return False, msg
+ return True, None
+ except Exception as e: # pylint: disable=broad-except
+ return False, str(e)
+
+ def get_credential_file_mounts(self) -> Dict[str, str]:
+ return {CREDENTIAL_FILE: CREDENTIAL_FILE}
+
+ @classmethod
+ def get_user_identities(cls) -> Optional[List[List[str]]]:
+ # NOTE: used for very advanced SkyPilot functionality
+ # Can implement later if desired
+ return None
+
+ def instance_type_exists(self, instance_type: str) -> bool:
+ return catalog.instance_type_exists(instance_type, _CLOUD)
+
+ def validate_region_zone(self, region: Optional[str], zone: Optional[str]):
+ return catalog.validate_region_zone(region, zone, clouds=_CLOUD)
+
+ @classmethod
+ def get_image_size(cls, image_id: str, region: Optional[str]) -> float:
+ # TODO: use 0.0 for now to allow all images. We should change this to
+ # return the docker image size.
+ del image_id, region # unused
+ return 0.0
diff --git a/sky/core.py b/sky/core.py
index 58b1772cfd8..730664d6017 100644
--- a/sky/core.py
+++ b/sky/core.py
@@ -1,4 +1,5 @@
"""SDK functions for cluster/job management."""
+import shlex
import typing
from typing import Any, Dict, List, Literal, Optional, Tuple, Union
@@ -20,6 +21,7 @@
from sky.adaptors import common as adaptors_common
from sky.backends import backend_utils
from sky.backends import cloud_vm_ray_backend
+from sky.backends import task_codegen
from sky.clouds import cloud as sky_cloud
from sky.jobs.server import core as managed_jobs_core
from sky.provision.kubernetes import constants as kubernetes_constants
@@ -89,6 +91,14 @@ def optimize(
request_name=request_names.AdminPolicyRequestName.OPTIMIZE,
request_options=request_options) as dag:
dag.resolve_and_validate_volumes()
+ # Use job group optimizer for job groups to properly handle
+ # co-location constraints and show the combined optimizer table
+ if dag.is_job_group():
+ return optimizer.Optimizer.optimize_job_group(
+ dag=dag,
+ minimize=minimize,
+ blocked_resources=blocked_resources,
+ quiet=quiet)
return optimizer.Optimizer.optimize(dag=dag,
minimize=minimize,
blocked_resources=blocked_resources,
@@ -540,6 +550,8 @@ def _start(
f'Starting cluster {cluster_name!r} with backend {backend.NAME} '
'is not supported.')
+ hook: Optional[str] = None
+ hook_timeout: Optional[int] = None
controller = controller_utils.Controllers.from_name(cluster_name)
if controller is not None:
if down or idle_minutes_to_autostop:
@@ -568,6 +580,9 @@ def _start(
controller_autostop_config.enabled):
idle_minutes_to_autostop = controller_autostop_config.idle_minutes
down = controller_autostop_config.down
+ wait_for = controller_autostop_config.wait_for
+ hook = controller_autostop_config.hook
+ hook_timeout = controller_autostop_config.hook_timeout
else:
# For non-controller clusters, restore autostop configuration from
# database if not explicitly provided.
@@ -613,7 +628,15 @@ def _start(
all_file_mounts=None,
storage_mounts=storage_mounts)
if idle_minutes_to_autostop is not None:
- backend.set_autostop(handle, idle_minutes_to_autostop, wait_for, down)
+ # For controller clusters, hook comes from controller_autostop_config.
+ # For regular clusters, hook is None so it will be inherited from the
+ # existing config on the remote cluster.
+ backend.set_autostop(handle,
+ idle_minutes_to_autostop,
+ wait_for,
+ down,
+ hook=hook,
+ hook_timeout=hook_timeout)
return handle
@@ -695,8 +718,84 @@ def _stop_not_supported_message(resources: 'resources_lib.Resources') -> str:
return message
+def _graceful_job_cancel(handle: backends.ResourceHandle,
+ backend: backends.Backend,
+ cluster_name: str,
+ timeout: Optional[int] = None,
+ terminate: bool = True) -> None:
+ """Stop jobs and flush rclone uploads on all nodes in parallel."""
+ op = 'shutdown' if terminate else 'stop'
+ if (not isinstance(handle, backends.CloudVmRayResourceHandle) or
+ not isinstance(backend, backends.CloudVmRayBackend)):
+        logger.warning(f'Graceful {op} is only available for '
+ 'CloudVmRayBackend. Skipping...')
+ return
+
+ # Kill all running jobs on the cluster
+ logger.info(f'Graceful {op} enabled. Terminating user jobs on '
+ f'{cluster_name}...')
+ try:
+ backend.cancel_jobs(handle, jobs=None, cancel_all=True)
+ except Exception as e: # pylint: disable=broad-except
+ logger.warning(f'Failed to cancel jobs: {e}')
+
+ # Flush rclone uploads on all nodes in parallel
+ logger.info('Flushing MOUNT_CACHED uploads on all nodes of '
+ f'{cluster_name!r}...')
+
+ # Get the flush script
+ flush_script = task_codegen.TaskCodeGen.get_rclone_flush_script()
+
+ # Wrap with timeout if specified
+ if timeout:
+ flush_script = f'timeout {timeout} bash -c {shlex.quote(flush_script)}'
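+        # GNU coreutils `timeout` exits with code 124 when the command
+        # times out; this is checked when collecting results below.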
+
+ runners = handle.get_command_runners()
+ node_args = [(i, runner) for i, runner in enumerate(runners)]
+ errors = []
+ logger.debug(f'Waiting for uploads on {len(runners)} node(s)...')
+
+ def run_flush_on_node(args):
+ """Run flush script on a single node."""
+ node_id, runner = args
+ try:
+ returncode, stdout, stderr = runner.run(
+ flush_script,
+ stream_logs=False,
+ require_outputs=True,
+ )
+ return (node_id, returncode, stdout, stderr)
+ except Exception as e: # pylint: disable=broad-except
+ return (node_id, -1, '', str(e))
+
+ parallel_results = subprocess_utils.run_in_parallel(
+ run_flush_on_node,
+ node_args,
+ num_threads=len(runners),
+ )
+
+ for node_id, returncode, _, stderr in parallel_results:
+ if returncode == 0:
+ logger.debug(f'Node {node_id}: uploads flushed successfully')
+ elif returncode == 124: # timeout exit code
+ logger.warning(f'Node {node_id}: flush timed out after {timeout}s')
+ errors.append(f'Node {node_id}: timeout')
+ else:
+ logger.warning(
+ f'Node {node_id}: flush failed (rc={returncode}): {stderr}')
+ errors.append(f'Node {node_id}: {stderr}')
+
+ if errors:
+ logger.warning(f'Some nodes had flush errors: {errors}')
+ else:
+ logger.debug(f'All MOUNT_CACHED uploads completed on {cluster_name!r}')
+
+
@usage_lib.entrypoint
-def down(cluster_name: str, purge: bool = False) -> None:
+def down(cluster_name: str,
+ purge: bool = False,
+ graceful: bool = False,
+ graceful_timeout: Optional[int] = None) -> None:
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
"""Tears down a cluster.
@@ -712,6 +811,10 @@ def down(cluster_name: str, purge: bool = False) -> None:
troubleshooting scenarios; with it set, it is the user's
responsibility to ensure there are no leaked instances and related
resources.
+ graceful: Cancel the user's task but block until MOUNT_CACHED data is
+            fully uploaded. This helps preserve user data integrity.
+ graceful_timeout: If not None, sets a timeout for the graceful option
+ above (in seconds).
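+
+        Example (hypothetical values): ``sky.down('my-cluster', graceful=True,
+        graceful_timeout=300)`` cancels running jobs, waits up to 5 minutes
+        for MOUNT_CACHED uploads to flush, then tears down the cluster.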
Raises:
sky.exceptions.ClusterDoesNotExist: the specified cluster does not
@@ -724,14 +827,24 @@ def down(cluster_name: str, purge: bool = False) -> None:
if handle is None:
raise exceptions.ClusterDoesNotExist(
f'Cluster {cluster_name!r} does not exist.')
+ backend = backend_utils.get_backend_from_handle(handle)
+
+ if graceful:
+ _graceful_job_cancel(handle,
+ backend,
+ cluster_name,
+ graceful_timeout,
+ terminate=True)
usage_lib.record_cluster_name_for_current_operation(cluster_name)
- backend = backend_utils.get_backend_from_handle(handle)
backend.teardown(handle, terminate=True, purge=purge)
@usage_lib.entrypoint
-def stop(cluster_name: str, purge: bool = False) -> None:
+def stop(cluster_name: str,
+ purge: bool = False,
+ graceful: bool = False,
+ graceful_timeout: Optional[int] = None) -> None:
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
"""Stops a cluster.
@@ -750,6 +863,10 @@ def stop(cluster_name: str, purge: bool = False) -> None:
certain manual troubleshooting scenarios; with it set, it is the
user's responsibility to ensure there are no leaked instances and
related resources.
+ graceful: Cancel the user's task but block until MOUNT_CACHED data is
+            fully uploaded. This helps preserve user data integrity.
+ graceful_timeout: If not None, sets a timeout for the graceful option
+ above (in seconds).
Raises:
sky.exceptions.ClusterDoesNotExist: the specified cluster does not
@@ -791,17 +908,26 @@ def stop(cluster_name: str, purge: bool = False) -> None:
' To terminate the cluster instead, run: '
f'{colorama.Style.BRIGHT}sky down {cluster_name}') from e
+ if graceful:
+ _graceful_job_cancel(handle,
+ backend,
+ cluster_name,
+ graceful_timeout,
+ terminate=False)
+
usage_lib.record_cluster_name_for_current_operation(cluster_name)
backend.teardown(handle, terminate=False, purge=purge)
@usage_lib.entrypoint
def autostop(
- cluster_name: str,
- idle_minutes: int,
- wait_for: Optional[autostop_lib.AutostopWaitFor] = autostop_lib.
- DEFAULT_AUTOSTOP_WAIT_FOR,
- down: bool = False, # pylint: disable=redefined-outer-name
+ cluster_name: str,
+ idle_minutes: int,
+ wait_for: Optional[
+ autostop_lib.AutostopWaitFor] = autostop_lib.DEFAULT_AUTOSTOP_WAIT_FOR,
+ down: bool = False, # pylint: disable=redefined-outer-name
+ hook: Optional[str] = None,
+ hook_timeout: Optional[int] = None,
) -> None:
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
"""Schedules an autostop/autodown for a cluster.
@@ -835,6 +961,12 @@ def autostop(
to a negative number cancels any autostop/autodown setting.
down: if true, use autodown (tear down the cluster; non-restartable),
rather than autostop (restartable).
+ hook: optional script to execute on the remote cluster before autostop.
+ The script runs before the cluster is stopped or torn down. If the
+ hook fails, autostop will still proceed but a warning will be logged.
+ hook_timeout: timeout in seconds for hook execution. If None, uses
+ DEFAULT_AUTOSTOP_HOOK_TIMEOUT_SECONDS (3600 = 1 hour). The hook will
+ be terminated if it exceeds this timeout.
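+
+        Example (hypothetical hook): ``sky.autostop('my-cluster',
+        idle_minutes=10, hook='bash ~/checkpoint.sh', hook_timeout=600)``
+        runs the checkpoint script before stopping the idle cluster.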
Raises:
sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
@@ -890,7 +1022,12 @@ def autostop(
f'see reason above.') from e
usage_lib.record_cluster_name_for_current_operation(cluster_name)
- backend.set_autostop(handle, idle_minutes, wait_for, down)
+ backend.set_autostop(handle,
+ idle_minutes,
+ wait_for,
+ down,
+ hook=hook,
+ hook_timeout=hook_timeout)
# ==================
@@ -1132,6 +1269,43 @@ def tail_logs(cluster_name: str,
return returnval
+@usage_lib.entrypoint
+def tail_autostop_logs(cluster_name: str,
+ follow: bool = True,
+ tail: int = 0) -> int:
+ """Tails the autostop hook logs of a cluster.
+
+ Args:
+ cluster_name: name of the cluster.
+ follow: whether to follow the logs.
+ tail: number of lines to display from the end of the log file.
+
+ Raises:
+ ValueError: if arguments are invalid or the cluster is not supported.
+ sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
+ sky.exceptions.ClusterNotUpError: if the cluster is not UP.
+ sky.exceptions.NotSupportedError: if the cluster is not based on
+ CloudVmRayBackend.
+ sky.exceptions.ClusterOwnerIdentityMismatchError: if the current user is
+ not the same as the user who created the cluster.
+ sky.exceptions.CloudUserIdentityError: if we fail to get the current
+ user identity.
+
+ Returns:
+ Return code 0 on success, non-zero on failure.
+ """
+ # Check the status of the cluster.
+ handle = backend_utils.check_cluster_available(
+ cluster_name,
+ operation='tailing autostop logs',
+ )
+ backend = backend_utils.get_backend_from_handle(handle)
+
+ usage_lib.record_cluster_name_for_current_operation(cluster_name)
+ returnval = backend.tail_autostop_logs(handle, follow=follow, tail=tail)
+ return returnval
+
+
@usage_lib.entrypoint
def download_logs(
cluster_name: str,
@@ -1342,11 +1516,13 @@ def _realtime_kubernetes_gpu_availability_single(
region_filter=context,
quantity_filter=quantity_filter,
case_sensitive=False)
- assert (set(counts.keys()) == set(capacity.keys()) == set(
- available.keys())), (f'Keys of counts ({list(counts.keys())}), '
- f'capacity ({list(capacity.keys())}), '
- f'and available ({list(available.keys())}) '
- 'must be the same.')
+
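+    # counts, capacity, and available may come back with different key
+    # sets; union the keys and backfill defaults instead of asserting
+    # that they match.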
+ all_keys = set(counts.keys()) | set(capacity.keys()) | set(
+ available.keys())
+ counts = {key: counts.get(key, []) for key in all_keys}
+ capacity = {key: capacity.get(key, 0) for key in all_keys}
+ available = {key: available.get(key, 0) for key in all_keys}
+
realtime_gpu_availability_list: List[
models.RealtimeGpuAvailability] = []
diff --git a/sky/dag.py b/sky/dag.py
index 349f3ef1b04..d736ea667f0 100644
--- a/sky/dag.py
+++ b/sky/dag.py
@@ -1,13 +1,28 @@
"""DAGs: user applications to be run."""
+import enum
import pprint
import threading
import typing
-from typing import List, Optional
+from typing import Dict, List, Optional, Union
if typing.TYPE_CHECKING:
from sky import task
+class DagExecution(enum.Enum):
+ """Execution mode for DAGs with multiple tasks.
+
+ This controls how tasks in a multi-task DAG are executed.
+ """
+ SERIAL = 'serial' # Tasks execute sequentially (pipeline)
+ PARALLEL = 'parallel' # All tasks start in parallel (job group)
+
+
+# Default execution mode for jobs without an explicit execution mode set.
+# Used for single jobs and as a fallback for pipelines.
+DEFAULT_EXECUTION = DagExecution.SERIAL
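+
+# Example: a job-group loader marks a DAG as parallel via
+# `dag.set_execution(DagExecution.PARALLEL)`, after which
+# `dag.is_job_group()` returns True.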
+
+
class Dag:
"""Dag: a user application, represented as a DAG of Tasks.
@@ -15,6 +30,11 @@ class Dag:
>>> import sky
>>> with sky.Dag() as dag:
>>> task = sky.Task(...)
+
+ For JobGroups (heterogeneous parallel workloads):
+ >>> dag = dag_utils.load_job_group_from_yaml('job_group.yaml')
+ >>> # dag.is_job_group() returns True
+ >>> # dag.tasks contains jobs to run in parallel
"""
def __init__(self) -> None:
@@ -26,6 +46,18 @@ def __init__(self) -> None:
self.policy_applied: bool = False
self.pool: Optional[str] = None
+ # Execution mode for multi-task DAGs
+ self.execution: Optional[DagExecution] = None
+
+ # Primary/auxiliary task support for job groups
+ # If set, only the named tasks are "primary"; others are "auxiliary".
+ # When all primary tasks complete, auxiliary tasks are terminated.
+ self.primary_tasks: Optional[List[str]] = None
+ # Termination delay for auxiliary tasks when primary tasks complete.
+ # Can be a string like "30s" (applies to all auxiliary tasks) or
+ # a dict like {"default": "30s", "replay-buffer": "1m"}.
+ self.termination_delay: Optional[Union[str, Dict[str, str]]] = None
+
def add(self, task: 'task.Task') -> None:
self.graph.add_node(task)
self.tasks.append(task)
@@ -56,6 +88,74 @@ def __repr__(self) -> str:
def get_graph(self):
return self.graph
+ def is_job_group(self) -> bool:
+ """Check if this DAG represents a JobGroup.
+
+ A DAG is a JobGroup if it has parallel execution mode. This is the
+ defining characteristic that distinguishes job groups from pipelines.
+ """
+ return self.execution == DagExecution.PARALLEL
+
+ def set_execution(self, execution: 'DagExecution') -> None:
+ """Configure this DAG with the given execution mode."""
+ self.execution = execution
+
+ def get_termination_delay_secs(self, task_name: str) -> int:
+ """Get termination delay in seconds for a specific task.
+
+ Args:
+ task_name: The name of the task to get the delay for.
+
+ Returns:
+ Termination delay in seconds. Returns 0 if not configured.
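+
+            Example (illustrative): with termination_delay set to
+            {'default': '30s', 'replay-buffer': '1m'}, a task named
+            'replay-buffer' gets 60 seconds and any other task gets 30.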
+ """
+ if self.termination_delay is None:
+ return 0
+
+ # Import here to avoid circular imports
+ # pylint: disable=import-outside-toplevel
+ from sky.utils import resources_utils
+
+ # Get the delay string based on format (str or dict)
+ if isinstance(self.termination_delay, str):
+ delay_str = self.termination_delay
+ else:
+ delay_str = self.termination_delay.get(
+ task_name, self.termination_delay.get('default', '0s'))
+
+ return resources_utils.parse_time_seconds(delay_str)
+
+ def is_primary_task(self, task_name: str) -> bool:
+ """Check if a task is a primary task.
+
+ Args:
+ task_name: The name of the task to check.
+
+ Returns:
+ True if the task is primary. When primary_tasks is None or empty,
+ all tasks are considered primary.
+ """
+ if self.primary_tasks is None or len(self.primary_tasks) == 0:
+ return True
+ # pylint: disable=unsupported-membership-test
+ return task_name in self.primary_tasks
+
+ def get_auxiliary_task_names(self) -> typing.List[str]:
+ """Get the names of all auxiliary (non-primary) tasks.
+
+ Returns:
+ List of auxiliary task names. Returns empty list if all tasks
+ are primary (when primary_tasks is None or empty).
+ """
+ if self.primary_tasks is None or len(self.primary_tasks) == 0:
+ return []
+ # pylint: disable=unsupported-membership-test
+ return [
+ t.name
+ for t in self.tasks
+ if t.name is not None and t.name not in self.primary_tasks
+ ]
+
def is_chain(self) -> bool:
"""Check if the DAG is a linear chain of tasks."""
diff --git a/sky/dashboard/package-lock.json b/sky/dashboard/package-lock.json
index e41998e426f..db81f1f3f14 100644
--- a/sky/dashboard/package-lock.json
+++ b/sky/dashboard/package-lock.json
@@ -8,6 +8,7 @@
"name": "dashboard",
"version": "0.1.0",
"dependencies": {
+ "@codemirror/lang-yaml": "^6.1.2",
"@emotion/react": "^11.13.0",
"@emotion/styled": "^11.13.0",
"@mui/material": "^5.16.7",
@@ -18,6 +19,7 @@
"@radix-ui/react-label": "^2.1.0",
"@radix-ui/react-select": "^2.1.1",
"@radix-ui/react-slot": "^1.1.0",
+ "@uiw/react-codemirror": "^4.25.4",
"chart.js": "^4.4.3",
"class-variance-authority": "^0.7.0",
"clsx": "^2.1.1",
@@ -1920,6 +1922,114 @@
"dev": true,
"license": "MIT"
},
+ "node_modules/@codemirror/autocomplete": {
+ "version": "6.20.0",
+ "resolved": "https://registry.npmjs.org/@codemirror/autocomplete/-/autocomplete-6.20.0.tgz",
+ "integrity": "sha512-bOwvTOIJcG5FVo5gUUupiwYh8MioPLQ4UcqbcRf7UQ98X90tCa9E1kZ3Z7tqwpZxYyOvh1YTYbmZE9RTfTp5hg==",
+ "license": "MIT",
+ "dependencies": {
+ "@codemirror/language": "^6.0.0",
+ "@codemirror/state": "^6.0.0",
+ "@codemirror/view": "^6.17.0",
+ "@lezer/common": "^1.0.0"
+ }
+ },
+ "node_modules/@codemirror/commands": {
+ "version": "6.10.1",
+ "resolved": "https://registry.npmjs.org/@codemirror/commands/-/commands-6.10.1.tgz",
+ "integrity": "sha512-uWDWFypNdQmz2y1LaNJzK7fL7TYKLeUAU0npEC685OKTF3KcQ2Vu3klIM78D7I6wGhktme0lh3CuQLv0ZCrD9Q==",
+ "license": "MIT",
+ "dependencies": {
+ "@codemirror/language": "^6.0.0",
+ "@codemirror/state": "^6.4.0",
+ "@codemirror/view": "^6.27.0",
+ "@lezer/common": "^1.1.0"
+ }
+ },
+ "node_modules/@codemirror/lang-yaml": {
+ "version": "6.1.2",
+ "resolved": "https://registry.npmjs.org/@codemirror/lang-yaml/-/lang-yaml-6.1.2.tgz",
+ "integrity": "sha512-dxrfG8w5Ce/QbT7YID7mWZFKhdhsaTNOYjOkSIMt1qmC4VQnXSDSYVHHHn8k6kJUfIhtLo8t1JJgltlxWdsITw==",
+ "license": "MIT",
+ "dependencies": {
+ "@codemirror/autocomplete": "^6.0.0",
+ "@codemirror/language": "^6.0.0",
+ "@codemirror/state": "^6.0.0",
+ "@lezer/common": "^1.2.0",
+ "@lezer/highlight": "^1.2.0",
+ "@lezer/lr": "^1.0.0",
+ "@lezer/yaml": "^1.0.0"
+ }
+ },
+ "node_modules/@codemirror/language": {
+ "version": "6.12.1",
+ "resolved": "https://registry.npmjs.org/@codemirror/language/-/language-6.12.1.tgz",
+ "integrity": "sha512-Fa6xkSiuGKc8XC8Cn96T+TQHYj4ZZ7RdFmXA3i9xe/3hLHfwPZdM+dqfX0Cp0zQklBKhVD8Yzc8LS45rkqcwpQ==",
+ "license": "MIT",
+ "dependencies": {
+ "@codemirror/state": "^6.0.0",
+ "@codemirror/view": "^6.23.0",
+ "@lezer/common": "^1.5.0",
+ "@lezer/highlight": "^1.0.0",
+ "@lezer/lr": "^1.0.0",
+ "style-mod": "^4.0.0"
+ }
+ },
+ "node_modules/@codemirror/lint": {
+ "version": "6.9.3",
+ "resolved": "https://registry.npmjs.org/@codemirror/lint/-/lint-6.9.3.tgz",
+ "integrity": "sha512-y3YkYhdnhjDBAe0VIA0c4wVoFOvnp8CnAvfLqi0TqotIv92wIlAAP7HELOpLBsKwjAX6W92rSflA6an/2zBvXw==",
+ "license": "MIT",
+ "dependencies": {
+ "@codemirror/state": "^6.0.0",
+ "@codemirror/view": "^6.35.0",
+ "crelt": "^1.0.5"
+ }
+ },
+ "node_modules/@codemirror/search": {
+ "version": "6.6.0",
+ "resolved": "https://registry.npmjs.org/@codemirror/search/-/search-6.6.0.tgz",
+ "integrity": "sha512-koFuNXcDvyyotWcgOnZGmY7LZqEOXZaaxD/j6n18TCLx2/9HieZJ5H6hs1g8FiRxBD0DNfs0nXn17g872RmYdw==",
+ "license": "MIT",
+ "dependencies": {
+ "@codemirror/state": "^6.0.0",
+ "@codemirror/view": "^6.37.0",
+ "crelt": "^1.0.5"
+ }
+ },
+ "node_modules/@codemirror/state": {
+ "version": "6.5.4",
+ "resolved": "https://registry.npmjs.org/@codemirror/state/-/state-6.5.4.tgz",
+ "integrity": "sha512-8y7xqG/hpB53l25CIoit9/ngxdfoG+fx+V3SHBrinnhOtLvKHRyAJJuHzkWrR4YXXLX8eXBsejgAAxHUOdW1yw==",
+ "license": "MIT",
+ "dependencies": {
+ "@marijn/find-cluster-break": "^1.0.0"
+ }
+ },
+ "node_modules/@codemirror/theme-one-dark": {
+ "version": "6.1.3",
+ "resolved": "https://registry.npmjs.org/@codemirror/theme-one-dark/-/theme-one-dark-6.1.3.tgz",
+ "integrity": "sha512-NzBdIvEJmx6fjeremiGp3t/okrLPYT0d9orIc7AFun8oZcRk58aejkqhv6spnz4MLAevrKNPMQYXEWMg4s+sKA==",
+ "license": "MIT",
+ "dependencies": {
+ "@codemirror/language": "^6.0.0",
+ "@codemirror/state": "^6.0.0",
+ "@codemirror/view": "^6.0.0",
+ "@lezer/highlight": "^1.0.0"
+ }
+ },
+ "node_modules/@codemirror/view": {
+ "version": "6.39.11",
+ "resolved": "https://registry.npmjs.org/@codemirror/view/-/view-6.39.11.tgz",
+ "integrity": "sha512-bWdeR8gWM87l4DB/kYSF9A+dVackzDb/V56Tq7QVrQ7rn86W0rgZFtlL3g3pem6AeGcb9NQNoy3ao4WpW4h5tQ==",
+ "license": "MIT",
+ "dependencies": {
+ "@codemirror/state": "^6.5.0",
+ "crelt": "^1.0.6",
+ "style-mod": "^4.1.0",
+ "w3c-keyname": "^2.2.4"
+ }
+ },
"node_modules/@emnapi/core": {
"version": "1.4.3",
"resolved": "https://registry.npmjs.org/@emnapi/core/-/core-1.4.3.tgz",
@@ -2893,6 +3003,47 @@
"integrity": "sha512-M5UknZPHRu3DEDWoipU6sE8PdkZ6Z/S+v4dD+Ke8IaNlpdSQah50lz1KtcFBa2vsdOnwbbnxJwVM4wty6udA5w==",
"license": "MIT"
},
+ "node_modules/@lezer/common": {
+ "version": "1.5.0",
+ "resolved": "https://registry.npmjs.org/@lezer/common/-/common-1.5.0.tgz",
+ "integrity": "sha512-PNGcolp9hr4PJdXR4ix7XtixDrClScvtSCYW3rQG106oVMOOI+jFb+0+J3mbeL/53g1Zd6s0kJzaw6Ri68GmAA==",
+ "license": "MIT"
+ },
+ "node_modules/@lezer/highlight": {
+ "version": "1.2.3",
+ "resolved": "https://registry.npmjs.org/@lezer/highlight/-/highlight-1.2.3.tgz",
+ "integrity": "sha512-qXdH7UqTvGfdVBINrgKhDsVTJTxactNNxLk7+UMwZhU13lMHaOBlJe9Vqp907ya56Y3+ed2tlqzys7jDkTmW0g==",
+ "license": "MIT",
+ "dependencies": {
+ "@lezer/common": "^1.3.0"
+ }
+ },
+ "node_modules/@lezer/lr": {
+ "version": "1.4.8",
+ "resolved": "https://registry.npmjs.org/@lezer/lr/-/lr-1.4.8.tgz",
+ "integrity": "sha512-bPWa0Pgx69ylNlMlPvBPryqeLYQjyJjqPx+Aupm5zydLIF3NE+6MMLT8Yi23Bd9cif9VS00aUebn+6fDIGBcDA==",
+ "license": "MIT",
+ "dependencies": {
+ "@lezer/common": "^1.0.0"
+ }
+ },
+ "node_modules/@lezer/yaml": {
+ "version": "1.0.3",
+ "resolved": "https://registry.npmjs.org/@lezer/yaml/-/yaml-1.0.3.tgz",
+ "integrity": "sha512-GuBLekbw9jDBDhGur82nuwkxKQ+a3W5H0GfaAthDXcAu+XdpS43VlnxA9E9hllkpSP5ellRDKjLLj7Lu9Wr6xA==",
+ "license": "MIT",
+ "dependencies": {
+ "@lezer/common": "^1.2.0",
+ "@lezer/highlight": "^1.0.0",
+ "@lezer/lr": "^1.4.0"
+ }
+ },
+ "node_modules/@marijn/find-cluster-break": {
+ "version": "1.0.2",
+ "resolved": "https://registry.npmjs.org/@marijn/find-cluster-break/-/find-cluster-break-1.0.2.tgz",
+ "integrity": "sha512-l0h88YhZFyKdXIFNfSWpyjStDjGHwZ/U7iobcK1cQQD8sejsONdQtTVU+1wVN1PBw40PiiHB1vA5S7VTfQiP9g==",
+ "license": "MIT"
+ },
"node_modules/@mui/core-downloads-tracker": {
"version": "5.17.1",
"resolved": "https://registry.npmjs.org/@mui/core-downloads-tracker/-/core-downloads-tracker-5.17.1.tgz",
@@ -5300,6 +5451,59 @@
"url": "https://opencollective.com/eslint"
}
},
+ "node_modules/@uiw/codemirror-extensions-basic-setup": {
+ "version": "4.25.4",
+ "resolved": "https://registry.npmjs.org/@uiw/codemirror-extensions-basic-setup/-/codemirror-extensions-basic-setup-4.25.4.tgz",
+ "integrity": "sha512-YzNwkm0AbPv1EXhCHYR5v0nqfemG2jEB0Z3Att4rBYqKrlG7AA9Rhjc3IyBaOzsBu18wtrp9/+uhTyu7TXSRng==",
+ "license": "MIT",
+ "dependencies": {
+ "@codemirror/autocomplete": "^6.0.0",
+ "@codemirror/commands": "^6.0.0",
+ "@codemirror/language": "^6.0.0",
+ "@codemirror/lint": "^6.0.0",
+ "@codemirror/search": "^6.0.0",
+ "@codemirror/state": "^6.0.0",
+ "@codemirror/view": "^6.0.0"
+ },
+ "funding": {
+ "url": "https://jaywcjlove.github.io/#/sponsor"
+ },
+ "peerDependencies": {
+ "@codemirror/autocomplete": ">=6.0.0",
+ "@codemirror/commands": ">=6.0.0",
+ "@codemirror/language": ">=6.0.0",
+ "@codemirror/lint": ">=6.0.0",
+ "@codemirror/search": ">=6.0.0",
+ "@codemirror/state": ">=6.0.0",
+ "@codemirror/view": ">=6.0.0"
+ }
+ },
+ "node_modules/@uiw/react-codemirror": {
+ "version": "4.25.4",
+ "resolved": "https://registry.npmjs.org/@uiw/react-codemirror/-/react-codemirror-4.25.4.tgz",
+ "integrity": "sha512-ipO067oyfUw+DVaXhQCxkB0ZD9b7RnY+ByrprSYSKCHaULvJ3sqWYC/Zen6zVQ8/XC4o5EPBfatGiX20kC7XGA==",
+ "license": "MIT",
+ "dependencies": {
+ "@babel/runtime": "^7.18.6",
+ "@codemirror/commands": "^6.1.0",
+ "@codemirror/state": "^6.1.1",
+ "@codemirror/theme-one-dark": "^6.0.0",
+ "@uiw/codemirror-extensions-basic-setup": "4.25.4",
+ "codemirror": "^6.0.0"
+ },
+ "funding": {
+ "url": "https://jaywcjlove.github.io/#/sponsor"
+ },
+ "peerDependencies": {
+ "@babel/runtime": ">=7.11.0",
+ "@codemirror/state": ">=6.0.0",
+ "@codemirror/theme-one-dark": ">=6.0.0",
+ "@codemirror/view": ">=6.0.0",
+ "codemirror": ">=6.0.0",
+ "react": ">=17.0.0",
+ "react-dom": ">=17.0.0"
+ }
+ },
"node_modules/@ungap/structured-clone": {
"version": "1.3.0",
"resolved": "https://registry.npmjs.org/@ungap/structured-clone/-/structured-clone-1.3.0.tgz",
@@ -6680,6 +6884,21 @@
"node": ">= 0.12.0"
}
},
+ "node_modules/codemirror": {
+ "version": "6.0.2",
+ "resolved": "https://registry.npmjs.org/codemirror/-/codemirror-6.0.2.tgz",
+ "integrity": "sha512-VhydHotNW5w1UGK0Qj96BwSk/Zqbp9WbnyK2W/eVMv4QyF41INRGpjUhFJY7/uDNuudSc33a/PKr4iDqRduvHw==",
+ "license": "MIT",
+ "dependencies": {
+ "@codemirror/autocomplete": "^6.0.0",
+ "@codemirror/commands": "^6.0.0",
+ "@codemirror/language": "^6.0.0",
+ "@codemirror/lint": "^6.0.0",
+ "@codemirror/search": "^6.0.0",
+ "@codemirror/state": "^6.0.0",
+ "@codemirror/view": "^6.0.0"
+ }
+ },
"node_modules/collect-v8-coverage": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/collect-v8-coverage/-/collect-v8-coverage-1.0.2.tgz",
@@ -6871,6 +7090,12 @@
"node": "^14.15.0 || ^16.10.0 || >=18.0.0"
}
},
+ "node_modules/crelt": {
+ "version": "1.0.6",
+ "resolved": "https://registry.npmjs.org/crelt/-/crelt-1.0.6.tgz",
+ "integrity": "sha512-VQ2MBenTq1fWZUH9DJNGti7kKv6EeAuYr3cLwxUWhIu1baTaXh4Ib5W2CqHVqib4/MqbYGJqiL3Zb8GJZr3l4g==",
+ "license": "MIT"
+ },
"node_modules/cross-spawn": {
"version": "7.0.6",
"resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz",
@@ -13873,6 +14098,12 @@
"url": "https://github.com/sponsors/sindresorhus"
}
},
+ "node_modules/style-mod": {
+ "version": "4.1.3",
+ "resolved": "https://registry.npmjs.org/style-mod/-/style-mod-4.1.3.tgz",
+ "integrity": "sha512-i/n8VsZydrugj3Iuzll8+x/00GH2vnYsk1eomD8QiRrSAeW6ItbCQDtfXCeJHd0iwiNagqjQkvpvREEPtW3IoQ==",
+ "license": "MIT"
+ },
"node_modules/styled-jsx": {
"version": "5.1.1",
"resolved": "https://registry.npmjs.org/styled-jsx/-/styled-jsx-5.1.1.tgz",
@@ -14755,6 +14986,12 @@
"node": ">= 0.8"
}
},
+ "node_modules/w3c-keyname": {
+ "version": "2.2.8",
+ "resolved": "https://registry.npmjs.org/w3c-keyname/-/w3c-keyname-2.2.8.tgz",
+ "integrity": "sha512-dpojBhNsCNN7T82Tm7k26A6G9ML3NkhDsnw9n/eoxSRlVBB4CEtIQ/KTCLI2Fwf3ataSXRhYFkQi3SlnFwPvPQ==",
+ "license": "MIT"
+ },
"node_modules/w3c-xmlserializer": {
"version": "4.0.0",
"resolved": "https://registry.npmjs.org/w3c-xmlserializer/-/w3c-xmlserializer-4.0.0.tgz",
diff --git a/sky/dashboard/package.json b/sky/dashboard/package.json
index 31dbbf5bfab..05f5f7b6bfe 100644
--- a/sky/dashboard/package.json
+++ b/sky/dashboard/package.json
@@ -6,7 +6,7 @@
"dev": "node server.js",
"build": "next build",
"start": "NODE_ENV=production node server.js",
- "lint": "next lint",
+ "lint": "next lint --max-warnings 0",
"lint:fix": "next lint --fix",
"format": "prettier --write .",
"format:check": "prettier --check .",
@@ -15,6 +15,7 @@
"test:watch": "jest --watch"
},
"dependencies": {
+ "@codemirror/lang-yaml": "^6.1.2",
"@emotion/react": "^11.13.0",
"@emotion/styled": "^11.13.0",
"@mui/material": "^5.16.7",
@@ -25,6 +26,7 @@
"@radix-ui/react-label": "^2.1.0",
"@radix-ui/react-select": "^2.1.1",
"@radix-ui/react-slot": "^1.1.0",
+ "@uiw/react-codemirror": "^4.25.4",
"chart.js": "^4.4.3",
"class-variance-authority": "^0.7.0",
"clsx": "^2.1.1",
diff --git a/sky/dashboard/src/app/globals.css b/sky/dashboard/src/app/globals.css
index fbee99dc207..7822ecd45cc 100644
--- a/sky/dashboard/src/app/globals.css
+++ b/sky/dashboard/src/app/globals.css
@@ -505,3 +505,73 @@
text-align: left !important;
white-space: nowrap !important;
}
+
+/* ===== Infra Page Glassy Loading Styles ===== */
+
+@keyframes infra-shimmer {
+ 0% {
+ background-position: -200% 0;
+ }
+ 100% {
+ background-position: 200% 0;
+ }
+}
+
+/* Skeleton text placeholder - replaces CircularProgress in cells */
+.infra-skeleton-text {
+ display: inline-block;
+ height: 0.75rem;
+ background: linear-gradient(
+ 90deg,
+ hsl(var(--border)) 0%,
+ hsl(var(--secondary)) 50%,
+ hsl(var(--border)) 100%
+ );
+ background-size: 200% 100%;
+ animation: infra-shimmer 1.5s infinite;
+ border-radius: 3px;
+ min-width: 24px;
+}
+
+/* Row shimmer effect for progressive loading */
+.infra-loading-row {
+ position: relative;
+ overflow: hidden;
+}
+
+.infra-loading-row::after {
+ content: '';
+ position: absolute;
+ top: 0;
+ left: 0;
+ right: 0;
+ bottom: 0;
+ background: linear-gradient(
+ 90deg,
+ rgba(255, 255, 255, 0) 0%,
+ rgba(255, 255, 255, 0.5) 50%,
+ rgba(255, 255, 255, 0) 100%
+ );
+ background-size: 200% 100%;
+ animation: infra-shimmer 1.5s infinite;
+ pointer-events: none;
+}
+
+/* Glass overlay for table during refresh */
+.infra-table-refreshing {
+ position: relative;
+}
+
+.infra-table-refreshing::before {
+ content: '';
+ position: absolute;
+ top: 0;
+ left: 0;
+ right: 0;
+ bottom: 0;
+ background: rgba(255, 255, 255, 0.5);
+ backdrop-filter: blur(1px);
+ z-index: 5;
+ border-radius: inherit;
+ pointer-events: none;
+}
diff --git a/sky/dashboard/src/components/GPUMetricsSection.jsx b/sky/dashboard/src/components/GPUMetricsSection.jsx
new file mode 100644
index 00000000000..28c565d078d
--- /dev/null
+++ b/sky/dashboard/src/components/GPUMetricsSection.jsx
@@ -0,0 +1,212 @@
+import React, { useState } from 'react';
+import {
+ ChevronDownIcon,
+ ChevronRightIcon,
+ ExternalLinkIcon,
+} from 'lucide-react';
+import { CustomTooltip as Tooltip } from '@/components/utils';
+import { getGrafanaUrl, buildGrafanaUrl } from '@/utils/grafana';
+
+// Grafana configuration constants
+const GRAFANA_DASHBOARD_SLUG = 'skypilot-dcgm-gpu/skypilot-dcgm-gpu-metrics';
+const GRAFANA_ORG_ID = '1';
+
+// Time range presets for GPU metrics
+const TIME_RANGE_PRESETS = [
+ { label: '15m', value: '15m' },
+ { label: '1h', value: '1h' },
+ { label: '6h', value: '6h' },
+ { label: '24h', value: '24h' },
+ { label: '7d', value: '7d' },
+];
+
+// GPU panels configuration
+const GPU_PANELS = [
+ { id: '1', title: 'GPU Utilization', keyPrefix: 'gpu-util' },
+ { id: '2', title: 'GPU Memory Utilization', keyPrefix: 'gpu-memory' },
+ { id: '3', title: 'GPU Temperature', keyPrefix: 'gpu-temp' },
+ { id: '4', title: 'GPU Power Usage', keyPrefix: 'gpu-power' },
+];
+
+/**
+ * Build Grafana panel URL with filters
+ */
+const buildGrafanaMetricsUrl = (panelId, clusterNameOnCloud, timeRange) => {
+ const grafanaUrl = getGrafanaUrl();
+ const params = new URLSearchParams({
+ orgId: GRAFANA_ORG_ID,
+ from: timeRange.from,
+ to: timeRange.to,
+ timezone: 'browser',
+ 'var-cluster': clusterNameOnCloud,
+ 'var-node': '$__all',
+ 'var-gpu': '$__all',
+ theme: 'light',
+ panelId: panelId,
+ });
+ return `${grafanaUrl}/d-solo/${GRAFANA_DASHBOARD_SLUG}?${params.toString()}&__feature.dashboardSceneSolo`;
+};
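+
+// Illustrative result (hypothetical Grafana host): for panelId '1',
+// cluster 'my-cluster' and the default 1h range, this yields roughly
+// <grafana>/d-solo/skypilot-dcgm-gpu/skypilot-dcgm-gpu-metrics?orgId=1&from=now-1h&to=now&...&panelId=1&__feature.dashboardSceneSolo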
+
+/**
+ * Reusable GPU Metrics Section component
+ *
+ * @param {Object} props
+ * @param {string} props.clusterNameOnCloud - The cluster name for filtering metrics
+ * @param {string} props.displayName - The name to show in the "Showing:" text
+ * @param {number} props.refreshTrigger - Increment to trigger iframe refresh
+ * @param {string} props.storageKey - LocalStorage key for expanded state
+ * @param {React.ReactNode} props.headerExtra - Optional extra content for header (e.g., task selector)
+ * @param {string} props.noMetricsMessage - Custom message when no metrics available
+ */
+export function GPUMetricsSection({
+ clusterNameOnCloud,
+ displayName,
+ refreshTrigger = 0,
+ storageKey = 'skypilot-gpu-metrics-expanded',
+ headerExtra = null,
+ noMetricsMessage = 'No GPU metrics available.',
+}) {
+ const [timeRange, setTimeRange] = useState({ from: 'now-1h', to: 'now' });
+ const [isExpanded, setIsExpanded] = useState(() => {
+ if (typeof window !== 'undefined') {
+ const saved = localStorage.getItem(storageKey);
+ return saved === 'true';
+ }
+ return false;
+ });
+
+ const handleTimeRangePreset = (preset) => {
+ setTimeRange({
+ from: `now-${preset}`,
+ to: 'now',
+ });
+ };
+
+ const toggleExpanded = () => {
+ const newValue = !isExpanded;
+ setIsExpanded(newValue);
+ if (typeof window !== 'undefined') {
+ localStorage.setItem(storageKey, String(newValue));
+ }
+ };
+
+ const openInGrafana = () => {
+ const queryParams = new URLSearchParams({
+ orgId: GRAFANA_ORG_ID,
+ from: timeRange.from,
+ to: timeRange.to,
+ timezone: 'browser',
+ 'var-cluster': clusterNameOnCloud,
+ 'var-node': '$__all',
+ 'var-gpu': '$__all',
+ });
+ window.open(
+ buildGrafanaUrl(`/d/${GRAFANA_DASHBOARD_SLUG}?${queryParams.toString()}`),
+ '_blank'
+ );
+ };
+
+ return (
+
+
+ );
+
+ // Should not show any plugin names
+ expect(container.textContent).not.toContain('HiddenPlugin1');
+ expect(container.textContent).not.toContain('HiddenPlugin2');
+
+ // Should still show commit info
+ expect(container.textContent).toContain('Core commit');
+ expect(container.textContent).toContain('core123');
+ });
+ });
+});
diff --git a/sky/dashboard/src/components/infra.jsx b/sky/dashboard/src/components/infra.jsx
index 022e43bff14..5e728b405dc 100755
--- a/sky/dashboard/src/components/infra.jsx
+++ b/sky/dashboard/src/components/infra.jsx
@@ -7,6 +7,7 @@ import React, { useState, useEffect, useCallback } from 'react';
import { CircularProgress } from '@mui/material';
import { Layout } from '@/components/elements/layout';
import {
+ AlertTriangleIcon,
RotateCwIcon,
SearchIcon,
XIcon,
@@ -151,6 +152,16 @@ const GpuUtilizationBar = ({
);
};
+// Skeleton badge for loading cells - replaces CircularProgress size={12}
+const SkeletonBadge = () => (
+
+
+
+);
+
// Reusable component for infrastructure sections (SSH Node Pool or Kubernetes)
export function InfrastructureSection({
title,
@@ -216,6 +227,16 @@ export function InfrastructureSection({
);
}
+ // Determine if table should show refreshing state
+ // For K8s: show during loading or when contexts haven't all loaded yet
+ // For SSH/Slurm: only show during loading
+ const isTableRefreshing =
+ !isInitialLoad &&
+ (isLoading ||
+ (!(isSlurm || isSSH) &&
+ safeContexts.length > 0 &&
+ !safeContexts.every((c) => loadedContexts.has(c))));
+
// Show table if we have contexts to display, even if some data is still loading
if (safeContexts.length > 0) {
return (
@@ -243,7 +264,11 @@ export function InfrastructureSection({
-
-
- {nodesInContext.map((node, index) => {
- // Format CPU display: "X of Y free" or just "Y" if free is unknown
- let cpuDisplay = '-';
+          {/* NOTE: header markup reconstructed; original element attributes
+              were lost. */}
+          {nodesInContext.length > 0 && (
+            <div>
+              <table>
+                <thead>
+                  <tr>
+                    <th>Node</th>
+                    {!isSlurm && (
+                      <>
+                        <th>IP Address</th>
+                        <th>vCPU</th>
+                        <th>Memory (GB)</th>
+                      </>
+                    )}
+                    <th>GPU</th>
+                    <th>GPU Utilization</th>
+                    <th>Node Status</th>
+                  </tr>
+                </thead>
+                <tbody>
+ {nodesInContext.map((node, index) => {
+ // Format CPU display: "X of Y free" or just "Y" if free is unknown
+ let cpuDisplay = '-';
+ if (
+ node.cpu_count !== null &&
+ node.cpu_count !== undefined
+ ) {
+ const cpuTotal = formatCpu(node.cpu_count);
if (
- node.cpu_count !== null &&
- node.cpu_count !== undefined
+ node.cpu_free !== null &&
+ node.cpu_free !== undefined
) {
- const cpuTotal = formatCpu(node.cpu_count);
- if (
- node.cpu_free !== null &&
- node.cpu_free !== undefined
- ) {
- const cpuFree = formatCpu(node.cpu_free);
- cpuDisplay = `${cpuFree} of ${cpuTotal} free`;
- } else {
- cpuDisplay = cpuTotal;
- }
+ const cpuFree = formatCpu(node.cpu_free);
+ cpuDisplay = `${cpuFree} of ${cpuTotal} free`;
+ } else {
+ cpuDisplay = cpuTotal;
}
-
- // Format memory display: "X of Y free" or just "Y" if free is unknown
- // (GB is in column header, so don't include it in values)
- let memoryDisplay = '-';
+ }
+
+ // Format memory display: "X of Y free" or just "Y" if free is unknown
+ // (GB is in column header, so don't include it in values)
+ let memoryDisplay = '-';
+ if (
+ node.memory_gb !== null &&
+ node.memory_gb !== undefined
+ ) {
+ const memoryTotal = node.memory_gb.toFixed(1);
if (
- node.memory_gb !== null &&
- node.memory_gb !== undefined
+ node.memory_free_gb !== null &&
+ node.memory_free_gb !== undefined
) {
- const memoryTotal = node.memory_gb.toFixed(1);
- if (
- node.memory_free_gb !== null &&
- node.memory_free_gb !== undefined
- ) {
- const memoryFree = node.memory_free_gb.toFixed(1);
- memoryDisplay = `${memoryFree} of ${memoryTotal} free`;
- } else {
- memoryDisplay = memoryTotal;
+ const memoryFree = node.memory_free_gb.toFixed(1);
+ memoryDisplay = `${memoryFree} of ${memoryTotal} free`;
+ } else {
+ memoryDisplay = memoryTotal;
+ }
+ }
+
+ // Build utilization string
+ const utilizationStr = `${node.gpu_free} of ${node.gpu_total} free`;
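+                  // e.g. "2 of 8 free" for a node with 8 GPUs of which 2
+                  // are unallocated.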
+
+ // Build node status string
+ const statusInfo = [];
+
+ // Add not ready info
+ if (node.is_ready === false) {
+ statusInfo.push('NotReady');
+ }
+
+ // Add cordoned info
+ if (node.is_cordoned === true) {
+ statusInfo.push('Cordoned');
+ }
+
+ // Build taint info separately
+ const taints = node.taints || [];
+ let taintInfo = null;
+ if (taints.length > 0) {
+ const taintsByEffect = {};
+ for (const taint of taints) {
+ const effect = taint.effect;
+ const key = taint.key;
+ if (!taintsByEffect[effect]) {
+ taintsByEffect[effect] = [];
}
+ taintsByEffect[effect].push(key);
}
+ const taintStrs = Object.entries(taintsByEffect).map(
+ ([effect, keys]) =>
+ `${effect} Taint [${keys.join(', ')}]`
+ );
+ if (taintStrs.length > 0) {
+ taintInfo = taintStrs.join(', ');
+ }
+ }
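+                  // Example: taints {key: 'gpu', effect: 'NoSchedule'} and
+                  // {key: 'dedicated', effect: 'NoSchedule'} render as
+                  // "NoSchedule Taint [gpu, dedicated]".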
- const utilizationStr =
- node.is_ready === false
- ? `0 of ${node.gpu_total} free (Node NotReady)`
- : `${node.gpu_free} of ${node.gpu_total} free`;
+ const nodeStatusStr =
+ statusInfo.length > 0 || taintInfo
+ ? statusInfo.join(', ')
+ : 'Healthy';
+ const isNodeHealthy = statusInfo.length === 0 && !taintInfo;
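+                  // When only taints are present, statusInfo is empty, so
+                  // nodeStatusStr is blank and taintInfo (presumably rendered
+                  // separately) carries the detail; isNodeHealthy still marks
+                  // the node unhealthy.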
- return (
-
)}
- {/* GPU Metrics Section - only show for k8s contexts, not SSH node pools */}
+ {/* GPU Metrics Section - only show for k8s contexts, not SSH node pools or Slurm */}
{isGrafanaAvailable &&
gpusInContext &&
gpusInContext.length > 0 &&
- !isSSHContext && (
+ !isSSHContext &&
+ !isSlurm && (
<>
GPU Metrics
@@ -2049,42 +2144,52 @@ export function GPUs() {
const gpuDataPromise = forceRefresh
? getContextGPUData(context)
: dashboardCache.get(getContextGPUData, [context]);
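+      // When forceRefresh is set, the promise above bypasses dashboardCache
+      // so a manual refresh always hits the API; otherwise cached
+      // per-context results are reused.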
- gpuDataPromise.then((gpuData) => {
- // Mark this context as loaded (even if it has no GPUs)
- setLoadedContexts((prev) => new Set([...prev, context]));
-
- // Update perContextGPUs - merge in data for this context
- setPerContextGPUs((prev) => {
- // Remove any existing entries for this context, then add new ones
- const filtered = prev.filter((gpu) => gpu.context !== context);
- return [...filtered, ...gpuData.perContextGPUs];
- });
+ gpuDataPromise
+ .then((gpuData) => {
+ // Mark this context as loaded (even if it has no GPUs)
+ setLoadedContexts((prev) => new Set([...prev, context]));
+
+ // Update perContextGPUs - merge in data for this context
+ setPerContextGPUs((prev) => {
+ // Remove any existing entries for this context, then add new ones
+ const filtered = prev.filter((gpu) => gpu.context !== context);
+ return [...filtered, ...gpuData.perContextGPUs];
+ });
- // Update perNodeGPUs - merge in data for this context
- setPerNodeGPUs((prev) => {
- const filtered = prev.filter((node) => node.context !== context);
- return [...filtered, ...gpuData.perNodeGPUs];
- });
+ // Update perNodeGPUs - merge in data for this context
+ setPerNodeGPUs((prev) => {
+ const filtered = prev.filter((node) => node.context !== context);
+ return [...filtered, ...gpuData.perNodeGPUs];
+ });
- // Note: allGPUs is computed via useEffect when perContextGPUs changes
+ // Note: allGPUs is computed via useEffect when perContextGPUs changes
- // Update context errors if there was an error
- if (gpuData.error) {
+ // Update context errors if there was an error
+ if (gpuData.error) {
+ setContextErrors((prev) => ({
+ ...prev,
+ [context]: gpuData.error,
+ }));
+ }
+ })
+ .catch((error) => {
+ // Mark context as loaded even on error to prevent infinite spinner
+ setLoadedContexts((prev) => new Set([...prev, context]));
setContextErrors((prev) => ({
...prev,
- [context]: gpuData.error,
+ [context]: error.message || 'Failed to load GPU data',
}));
- }
-
- // Decrement pending count and check if ALL fetches are complete
- pendingContextCountRef.current--;
- if (
- pendingContextCountRef.current === 0 &&
- mainFetchDoneRef.current
- ) {
- setIsFetching(false); // Everything done, stop spinner
- }
- });
+ })
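+          // finally() runs whether the fetch resolved or rejected, so the
+          // pending counter always reaches zero and the spinner cannot hang.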
+ .finally(() => {
+ // Decrement pending count and check if ALL fetches are complete
+ pendingContextCountRef.current--;
+ if (
+ pendingContextCountRef.current === 0 &&
+ mainFetchDoneRef.current
+ ) {
+ setIsFetching(false); // Everything done, stop spinner
+ }
+ });
});
} catch (error) {
console.error('Error in fetchKubernetesData:', error);
@@ -2320,6 +2425,7 @@ export function GPUs() {
};
initializeData();
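+    // Run once on mount by design; the lint rule is silenced below instead
+    // of adding deps that would re-trigger initialization.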
+ // eslint-disable-next-line react-hooks/exhaustive-deps
}, []);
// Effect for interval refresh.
@@ -2691,6 +2797,7 @@ export function GPUs() {
gpusInContext={gpusInContext}
nodesInContext={nodesInContext}
gpuMetricsRefreshTrigger={gpuMetricsRefreshTrigger}
+ isSlurm={isSlurmCluster}
/>
);
};
@@ -2732,7 +2839,14 @@ export function GPUs() {
: `No enabled clouds for workspace "${selectedWorkspace}".`}
) : (
-
+
@@ -2747,7 +2861,7 @@ export function GPUs() {
-
+
{filteredCloudInfraData.map((cloud) => {
// Use separate loading states for progressive loading
// Clusters and jobs load independently (clusters often ready first)
@@ -2756,15 +2870,13 @@ export function GPUs() {
const jobCount = cloudJobCounts[cloud.name] ?? cloud.jobs;
return (
-