skypilot-org
diff --git a/‎.buildkite/generate_pipeline.py‎
Lines changed: 106 additions & 38 deletions b/‎.buildkite/generate_pipeline.py‎
Lines changed: 106 additions & 38 deletions
diff --git a/‎README.md‎
Lines changed: 2 additions & 2 deletions b/‎README.md‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎charts/skypilot/templates/api-deployment.yaml‎
Lines changed: 5 additions & 2 deletions b/‎charts/skypilot/templates/api-deployment.yaml‎
Lines changed: 5 additions & 2 deletions
diff --git a/‎charts/skypilot/tests/deployment_test.yaml‎
Lines changed: 44 additions & 0 deletions b/‎charts/skypilot/tests/deployment_test.yaml‎
Lines changed: 44 additions & 0 deletions
diff --git a/‎charts/skypilot/values.schema.json‎
Lines changed: 20 additions & 0 deletions b/‎charts/skypilot/values.schema.json‎
Lines changed: 20 additions & 0 deletions
@@ -25,7 +25,9 @@
 import collections
 import os
 import re
+import shlex
 import subprocess
+import sys
 import time
 from typing import Any, Dict, List, Optional, Tuple
 
@@ -124,7 +126,7 @@ def _parse_args(args: Optional[str] = None):
     :return: (list_of_clouds, k_pattern)
     """
     if args:
-        args_list = args.split()
+        args_list = shlex.split(args)
     else:
         args_list = []
     parser = argparse.ArgumentParser(
@@ -154,7 +156,10 @@ def _parse_args(args: Optional[str] = None):
     parser.add_argument('--dependency', nargs='?', const='', default='all')
     parser.add_argument('--concurrency', type=int)
 
-    parsed_args, _ = parser.parse_known_args(args_list)
+    # pytest_native: args the generate_pipeline parser does not recognise
+    # (e.g. --no-resource-heavy).  They are conftest-registered pytest flags
+    # and must be forwarded to `pytest --collect-only` unchanged.
+    parsed_args, pytest_native = parser.parse_known_args(args_list)
 
     # Collect chosen clouds from the flags
     # TODO(zpoint): get default clouds from the conftest.py
@@ -177,38 +182,52 @@ def _parse_args(args: Optional[str] = None):
     if not default_clouds_to_run:
         default_clouds_to_run = DEFAULT_CLOUDS_TO_RUN
 
-    extra_args = []
+    # Each entry is a single shell token so that shlex.join() can safely
+    # quote the list when it is passed to pytest --collect-only.
+    extra_args: List[str] = []
     if parsed_args.remote_server:
         extra_args.append('--remote-server')
     if parsed_args.base_branch:
-        extra_args.append(f'--base-branch {parsed_args.base_branch}')
+        extra_args.extend(['--base-branch', parsed_args.base_branch])
     if parsed_args.controller_cloud:
-        extra_args.append(f'--controller-cloud {parsed_args.controller_cloud}')
+        extra_args.extend(['--controller-cloud', parsed_args.controller_cloud])
     if parsed_args.postgres:
         extra_args.append('--postgres')
     if parsed_args.helm_version:
-        extra_args.append(f'--helm-version {parsed_args.helm_version}')
+        extra_args.extend(['--helm-version', parsed_args.helm_version])
     if parsed_args.helm_package:
-        extra_args.append(f'--helm-package {parsed_args.helm_package}')
+        extra_args.extend(['--helm-package', parsed_args.helm_package])
     if parsed_args.jobs_consolidation:
         extra_args.append('--jobs-consolidation')
     if parsed_args.serve_consolidation:
         extra_args.append('--serve-consolidation')
     if parsed_args.grpc:
         extra_args.append('--grpc')
     if parsed_args.env_file:
-        extra_args.append(f'--env-file {parsed_args.env_file}')
+        extra_args.extend(['--env-file', parsed_args.env_file])
     if parsed_args.plugin_yaml:
-        extra_args.append(f'--plugin-yaml {parsed_args.plugin_yaml}')
+        extra_args.extend(['--plugin-yaml', parsed_args.plugin_yaml])
     if parsed_args.submodule_base_branch:
-        extra_args.append(
-            f'--submodule-base-branch {parsed_args.submodule_base_branch}')
+        extra_args.extend(
+            ['--submodule-base-branch', parsed_args.submodule_base_branch])
     if parsed_args.dependency != 'all':
-        space = ' ' if parsed_args.dependency else ''
-        extra_args.append(f'--dependency{space}{parsed_args.dependency}')
+        if parsed_args.dependency:
+            extra_args.extend(['--dependency', parsed_args.dependency])
+        else:
+            extra_args.append('--dependency')
+    # Cloud flags are conftest-registered; include them in extra_args so that
+    # they reach `pytest --collect-only` (some marks depend on which clouds
+    # are active).  They are already captured in default_clouds_to_run for
+    # Buildkite-step generation; adding them here is intentional duplication.
+    for cloud in all_clouds_in_smoke_tests:
+        if getattr(parsed_args, cloud, False):
+            extra_args.append(f'--{cloud}')
+    if parsed_args.generic_cloud:
+        extra_args.extend(['--generic-cloud', parsed_args.generic_cloud])
 
     return (default_clouds_to_run, parsed_args.k, extra_args,
-            parsed_args.concurrency, parsed_args.env_file is not None)
+            parsed_args.concurrency, parsed_args.env_file
+            is not None, pytest_native)
 
 
 def _extract_marked_tests(
@@ -236,6 +255,19 @@ def _extract_marked_tests(
     # Args are already in the format pytest expects (cloud names like --lambda)
     cmd = f'pytest {file_path} --collect-only {args}'
     output = subprocess.run(cmd, shell=True, capture_output=True, text=True)
+    # Exit code 5 means "no tests collected" — normal when a file has no
+    # matching tests for the requested clouds.  Any other non-zero code is a
+    # real error (e.g. unrecognised arguments, import failure) that would
+    # silently produce 0 matches and generate an empty pipeline.  Fail loudly
+    # so the build is visibly broken rather than a noop.
+    if output.returncode not in (0, 5):
+        print(
+            f'ERROR: pytest collection failed (exit {output.returncode}) '
+            f'for {file_path}:\n'
+            f'STDOUT:\n{output.stdout}\n'
+            f'STDERR:\n{output.stderr}',
+            file=sys.stderr)
+        sys.exit(output.returncode)
     matches = re.findall('Collected .+?\.py::(.+?) with marks: \[(.*?)\]',
                          output.stdout)
 
@@ -331,9 +363,17 @@ def _generate_pipeline(test_file: str, args: str) -> Dict[str, Any]:
     """Generate a Buildkite pipeline from test files."""
     steps = []
     generated_steps_set = set()
-    (default_clouds_to_run, k_value, extra_args, concurrency,
-     has_env_file) = _parse_args(args)
-    function_cloud_map = _extract_marked_tests(test_file, args,
+    (default_clouds_to_run, k_value, extra_args, concurrency, has_env_file,
+     pytest_native) = _parse_args(args)
+    # Pass a clean arg string: extra_args (flags the generate_pipeline parser
+    # recognised, re-emitted by _parse_args) + pytest_native (flags it did
+    # not recognise).  Flags consumed without being re-emitted (e.g.
+    # --concurrency) are dropped; an argument the pinned conftest does not
+    # register makes `pytest --collect-only` exit with code 4.
+    # NOTE(review): --base-branch, --submodule-base-branch, --dependency and
+    # --generic-cloud ARE in extra_args and so still reach pytest -- confirm.
+    pytest_collect_args = shlex.join(extra_args + list(pytest_native))
+    function_cloud_map = _extract_marked_tests(test_file, pytest_collect_args,
                                                default_clouds_to_run, k_value,
                                                extra_args)
     concurrency_limit = None
@@ -392,32 +432,40 @@ def _generate_pipeline(test_file: str, args: str) -> Dict[str, Any]:
 def _dump_pipeline_to_file(yaml_file_path: str,
                            pipelines: List[Dict[str, Any]],
                            trigger_command: str,
-                           extra_env: Optional[Dict[str, str]] = None):
+                           extra_env: Optional[Dict[str, str]] = None) -> int:
+    """Write the generated steps to a pipeline file; return the step count.
+
+    main() always generates more than one pipeline file (e.g. release and
+    quick-tests-core).  A `-k`/file filter often matches tests in only one of
+    them, so an individual file legitimately ending up with 0 steps is not an
+    error -- it is skipped here, and main() fails loudly only if *every* file
+    is empty (the genuine "matched nothing anywhere" misconfiguration).
+    """
     default_env = {
         'LOG_TO_STDOUT': '1',
         'SKYPILOT_DISABLE_USAGE_COLLECTION': '1'
     }
     if extra_env:
         default_env.update(extra_env)
+    all_steps = []
+    for pipeline in pipelines:
+        all_steps.extend(pipeline['steps'])
+
+    if not all_steps:
+        # Buildkite rejects pipelines with empty step groups, so skip writing
+        # this file. main() decides whether 0 steps overall is fatal.
+        print(f'No matching tests for {yaml_file_path}, skipping.')
+        return 0
+
     with open(yaml_file_path, 'w', encoding='utf-8') as file:
         file.write(GENERATED_FILE_HEAD)
-        all_steps = []
-        for pipeline in pipelines:
-            all_steps.extend(pipeline['steps'])
-
         # Extract key from trigger command, keeping only valid characters
         key = re.sub(r'[^a-zA-Z0-9_\-:]', '',
                      re.match(r'^[^ ]*', trigger_command).group(0))
         # Generate formatted group name from key
         group_name = ' '.join(
             word.capitalize() for word in re.split(r'[-_]', key))
 
-        if not all_steps:
-            # Skip empty groups — Buildkite rejects pipelines with
-            # empty step groups.
-            print(f'No matching tests for {yaml_file_path}, skipping.')
-            return
-
         grouped_steps = [{
             'group': group_name,
             'key': key,
@@ -431,9 +479,11 @@ def _dump_pipeline_to_file(yaml_file_path: str,
 
         final_pipeline = {'steps': grouped_steps, 'env': default_env}
         yaml.dump(final_pipeline, file, default_flow_style=False)
+    return len(all_steps)
 
 
-def _convert_release(test_files: List[str], args: str, trigger_command: str):
+def _convert_release(test_files: List[str], args: str,
+                     trigger_command: str) -> int:
     yaml_file_path = '.buildkite/pipeline_smoke_tests_release.yaml'
     output_file_pipelines = []
     for test_file in test_files:
@@ -442,8 +492,8 @@ def _convert_release(test_files: List[str], args: str, trigger_command: str):
         output_file_pipelines.append(pipeline)
         print(f'Converted {test_file} to {yaml_file_path}\n\n')
     # Enable all clouds by default for release pipeline.
-    _dump_pipeline_to_file(yaml_file_path, output_file_pipelines,
-                           trigger_command)
+    return _dump_pipeline_to_file(yaml_file_path, output_file_pipelines,
+                                  trigger_command)
 
 
 def _rest_request(url: str,
@@ -488,7 +538,7 @@ def _get_latest_pypi_version():
 
 
 def _convert_quick_tests_core(test_files: List[str], args: str,
-                              trigger_command: str):
+                              trigger_command: str) -> int:
     yaml_file_path = '.buildkite/pipeline_smoke_tests_quick_tests_core.yaml'
     base_branch = '--base-branch' in args
     base_branches = []
@@ -515,10 +565,11 @@ def _convert_quick_tests_core(test_files: List[str], args: str,
             pipeline = _generate_pipeline(test_file, args)
             output_file_pipelines.append(pipeline)
         print(f'Converted {test_file} to {yaml_file_path}\n\n')
-    _dump_pipeline_to_file(yaml_file_path,
-                           output_file_pipelines,
-                           trigger_command,
-                           extra_env={'SKYPILOT_SUPPRESS_SENSITIVE_LOG': '1'})
+    return _dump_pipeline_to_file(
+        yaml_file_path,
+        output_file_pipelines,
+        trigger_command,
+        extra_env={'SKYPILOT_SUPPRESS_SENSITIVE_LOG': '1'})
 
 
 @click.command()
@@ -562,8 +613,25 @@ def main(args: str, file_pattern: str):
             release_files.append(test_file)
 
     print(f'trigger_command: {trigger_command}')
-    _convert_release(release_files, args, trigger_command)
-    _convert_quick_tests_core(quick_tests_core_files, args, trigger_command)
+    total_steps = 0
+    total_steps += _convert_release(release_files, args, trigger_command)
+    total_steps += _convert_quick_tests_core(quick_tests_core_files, args,
+                                             trigger_command)
+
+    if total_steps == 0:
+        # Every generated pipeline file was empty: pytest --collect-only matched
+        # no tests anywhere.  This is almost always a misconfiguration (wrong
+        # cloud filter, unrecognised ARGS flag, missing env file, a typo'd -k)
+        # rather than a legitimate "nothing to run".  Fail loudly so the empty
+        # pipeline is not uploaded as a vacuous success that posts a false
+        # "passed" status while running zero tests.
+        print(
+            'ERROR: No pipeline steps generated for any pipeline file. '
+            'pytest --collect-only matched 0 tests across all test files. '
+            'Check that ARGS point to valid tests and that the env-file (if '
+            'any) is reachable.',
+            file=sys.stderr)
+        sys.exit(1)
 
 
 if __name__ == '__main__':
 
@@ -48,15 +48,15 @@ SkyPilot gives **AI teams** a simple interface to run jobs on any infra.
 -----
 
 :fire: *News* :fire:
+- [Jun 2026] **Announcing SkyPilot Sandboxes**: run untrusted, LLM-generated code on the Kubernetes clusters you already own. [**Learn more**](https://blog.skypilot.co/sandboxes/), [**join early access**](https://forms.gle/o4keAryXsVazNjyGA)
+- [May 2026] **How Multiverse doubled their GPU utilization with SkyPilot**: [**case study**](https://multiversecomputing.com/papers/2x-gpu-utilization-same-hardware-discover-our-efficiency-playbook)
 - [Apr 2026] Introducing **GPU Compass**: One dashboard to browse, compare pricing, and launch across every GPU cloud. Try it at [**gpus.skypilot.co**](https://gpus.skypilot.co).
 - [Apr 2026] **Research-Driven Agents**: Agents read arxiv papers before coding, landed 5 llama.cpp kernel fusions and +15% faster flash attention in ~3 hours for ~$29: [**blog**](https://blog.skypilot.co/research-driven-agents/), [**HackerNews**](https://news.ycombinator.com/item?id=47706141)
 - [Mar 2026] **Scaling Karpathy's Autoresearch**: Autoresearch runs 1 experiment at a time. We gave it 16 GPUs and let it run in parallel: [**blog**](https://blog.skypilot.co/scaling-autoresearch/), [**HackerNews**](https://news.ycombinator.com/item?id=47442435)
 - [Mar 2026] **How H Company Unlocked Online RL and Unified their AI Platform**: [**case study**](https://hcompany.ai/unlocking-online-rl-skypilot)
 - [Mar 2026] **SkyPilot v0.12** released: Slurm Support, Job Groups for RL, Agent Skill, Recipes, Pool Autoscaling for Batch Inference, 7x Data Mounting, and More: [**Release notes**](https://github.com/skypilot-org/skypilot/releases/tag/v0.12.0)
 - [Mar 2026] **SkyPilot Agent Skills**: GPU access and job management for AI agents: [**docs**](https://docs.skypilot.co/en/latest/getting-started/skill.html)
 - [Jan 2026] **Shopify case study**: Shopify runs all AI training workloads on SkyPilot: [**case study**](https://shopify.engineering/skypilot)
-- [Dec 2025] **SkyPilot v0.11** released: Multi-Cloud Pools, Fast Managed Jobs, Enterprise-Readiness at Large Scale, Programmability. [**Release notes**](https://github.com/skypilot-org/skypilot/releases/tag/v0.11.0)
-- [Dec 2025] Train **an agent to use Google Search** as a tool with RL on your Kubernetes or clouds: [**blog**](https://blog.skypilot.co/verl-tool-calling/), [**example**](./llm/verl/)
 
 ## Overview
 
 
@@ -14,11 +14,14 @@ spec:
   {{- if and .Values.storage.enabled (ne .Values.storage.accessMode "ReadWriteMany") }}
   {{- fail "Local storage with ReadWriteOnce access mode is not supported when using RollingUpdate strategy. Either use Recreate upgrade strategy, set storage.enabled to false, or use ReadWriteMany access mode with a compatible storage class (e.g., NFS-backed storage like Google Filestore)." }}
   {{- end }}
+  {{- $rollingUpdate := .Values.apiService.rollingUpdate | default (dict) }}
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1
-      maxUnavailable: 0
+      {{- /* dig (not `default`) so an explicit 0 is honored — Go templates treat
+             0 as empty, which would silently fall back to the default. */}}
+      maxSurge: {{ dig "maxSurge" 1 $rollingUpdate }}
+      maxUnavailable: {{ dig "maxUnavailable" 0 $rollingUpdate }}
   {{- else }}
   strategy:
     type: Recreate
 
@@ -191,6 +191,50 @@ tests:
               fieldRef:
                 fieldPath: metadata.uid
 
+  - it: should honor configurable maxSurge/maxUnavailable including explicit 0
+    set:
+      apiService.upgradeStrategy: RollingUpdate
+      apiService.dbConnectionSecretName: test-db-secret
+      storage.enabled: false
+      apiService.rollingUpdate.maxSurge: 0
+      apiService.rollingUpdate.maxUnavailable: 1
+    asserts:
+      - equal:
+          path: spec.strategy.rollingUpdate.maxSurge
+          value: 0
+      - equal:
+          path: spec.strategy.rollingUpdate.maxUnavailable
+          value: 1
+
+  - it: should support percentage strings for maxSurge/maxUnavailable
+    set:
+      apiService.upgradeStrategy: RollingUpdate
+      apiService.dbConnectionSecretName: test-db-secret
+      storage.enabled: false
+      apiService.rollingUpdate.maxSurge: 25%
+      apiService.rollingUpdate.maxUnavailable: 50%
+    asserts:
+      - equal:
+          path: spec.strategy.rollingUpdate.maxSurge
+          value: 25%
+      - equal:
+          path: spec.strategy.rollingUpdate.maxUnavailable
+          value: 50%
+
+  - it: should fall back to defaults when rollingUpdate is null
+    set:
+      apiService.upgradeStrategy: RollingUpdate
+      apiService.dbConnectionSecretName: test-db-secret
+      storage.enabled: false
+      apiService.rollingUpdate: null
+    asserts:
+      - equal:
+          path: spec.strategy.rollingUpdate.maxSurge
+          value: 1
+      - equal:
+          path: spec.strategy.rollingUpdate.maxUnavailable
+          value: 0
+
   - it: should fail RollingUpdate strategy without external database
     set:
       apiService.upgradeStrategy: RollingUpdate
 
@@ -192,6 +192,26 @@
                 "resources": {
                     "type": "object"
                 },
+                "rollingUpdate": {
+                    "type": [
+                        "object",
+                        "null"
+                    ],
+                    "properties": {
+                        "maxSurge": {
+                            "type": [
+                                "integer",
+                                "string"
+                            ]
+                        },
+                        "maxUnavailable": {
+                            "type": [
+                                "integer",
+                                "string"
+                            ]
+                        }
+                    }
+                },
                 "serveServerLog": {
                     "type": "boolean"
                 },