Skip to content

Commit bfce142

Browse files
committed
Merge upstream master into Modal PR
2 parents 01fbed7 + 2de65b7 commit bfce142

86 files changed

Lines changed: 4714 additions & 458 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.buildkite/generate_pipeline.py

Lines changed: 106 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,9 @@
2525
import collections
2626
import os
2727
import re
28+
import shlex
2829
import subprocess
30+
import sys
2931
import time
3032
from typing import Any, Dict, List, Optional, Tuple
3133

@@ -124,7 +126,7 @@ def _parse_args(args: Optional[str] = None):
124126
:return: (list_of_clouds, k_pattern)
125127
"""
126128
if args:
127-
args_list = args.split()
129+
args_list = shlex.split(args)
128130
else:
129131
args_list = []
130132
parser = argparse.ArgumentParser(
@@ -154,7 +156,10 @@ def _parse_args(args: Optional[str] = None):
154156
parser.add_argument('--dependency', nargs='?', const='', default='all')
155157
parser.add_argument('--concurrency', type=int)
156158

157-
parsed_args, _ = parser.parse_known_args(args_list)
159+
# pytest_native: args the generate_pipeline parser does not recognise
160+
# (e.g. --no-resource-heavy). They are conftest-registered pytest flags
161+
# and must be forwarded to `pytest --collect-only` unchanged.
162+
parsed_args, pytest_native = parser.parse_known_args(args_list)
158163

159164
# Collect chosen clouds from the flags
160165
# TODO(zpoint): get default clouds from the conftest.py
@@ -177,38 +182,52 @@ def _parse_args(args: Optional[str] = None):
177182
if not default_clouds_to_run:
178183
default_clouds_to_run = DEFAULT_CLOUDS_TO_RUN
179184

180-
extra_args = []
185+
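# Each entry must be a single shell token so that shlex.join() can safely
186+
# quote the list when it is passed to pytest --collect-only. NOTE(review): the '--generic-cloud <value>' entry appended further down is one string containing a space, so shlex.join() would quote it as a single token; it likely needs to be two entries like the other valued flags.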
187+
extra_args: List[str] = []
181188
if parsed_args.remote_server:
182189
extra_args.append('--remote-server')
183190
if parsed_args.base_branch:
184-
extra_args.append(f'--base-branch {parsed_args.base_branch}')
191+
extra_args.extend(['--base-branch', parsed_args.base_branch])
185192
if parsed_args.controller_cloud:
186-
extra_args.append(f'--controller-cloud {parsed_args.controller_cloud}')
193+
extra_args.extend(['--controller-cloud', parsed_args.controller_cloud])
187194
if parsed_args.postgres:
188195
extra_args.append('--postgres')
189196
if parsed_args.helm_version:
190-
extra_args.append(f'--helm-version {parsed_args.helm_version}')
197+
extra_args.extend(['--helm-version', parsed_args.helm_version])
191198
if parsed_args.helm_package:
192-
extra_args.append(f'--helm-package {parsed_args.helm_package}')
199+
extra_args.extend(['--helm-package', parsed_args.helm_package])
193200
if parsed_args.jobs_consolidation:
194201
extra_args.append('--jobs-consolidation')
195202
if parsed_args.serve_consolidation:
196203
extra_args.append('--serve-consolidation')
197204
if parsed_args.grpc:
198205
extra_args.append('--grpc')
199206
if parsed_args.env_file:
200-
extra_args.append(f'--env-file {parsed_args.env_file}')
207+
extra_args.extend(['--env-file', parsed_args.env_file])
201208
if parsed_args.plugin_yaml:
202-
extra_args.append(f'--plugin-yaml {parsed_args.plugin_yaml}')
209+
extra_args.extend(['--plugin-yaml', parsed_args.plugin_yaml])
203210
if parsed_args.submodule_base_branch:
204-
extra_args.append(
205-
f'--submodule-base-branch {parsed_args.submodule_base_branch}')
211+
extra_args.extend(
212+
['--submodule-base-branch', parsed_args.submodule_base_branch])
206213
if parsed_args.dependency != 'all':
207-
space = ' ' if parsed_args.dependency else ''
208-
extra_args.append(f'--dependency{space}{parsed_args.dependency}')
214+
if parsed_args.dependency:
215+
extra_args.extend(['--dependency', parsed_args.dependency])
216+
else:
217+
extra_args.append('--dependency')
218+
# Cloud flags are conftest-registered; include them in extra_args so that
219+
# they reach `pytest --collect-only` (some marks depend on which clouds
220+
# are active). They are already captured in default_clouds_to_run for
221+
# Buildkite-step generation; adding them here is intentional duplication.
222+
for cloud in all_clouds_in_smoke_tests:
223+
if getattr(parsed_args, cloud, False):
224+
extra_args.append(f'--{cloud}')
225+
if parsed_args.generic_cloud:
226+
extra_args.append(f'--generic-cloud {parsed_args.generic_cloud}')
209227

210228
return (default_clouds_to_run, parsed_args.k, extra_args,
211-
parsed_args.concurrency, parsed_args.env_file is not None)
229+
parsed_args.concurrency, parsed_args.env_file
230+
is not None, pytest_native)
212231

213232

214233
def _extract_marked_tests(
@@ -236,6 +255,19 @@ def _extract_marked_tests(
236255
# Args are already in the format pytest expects (cloud names like --lambda)
237256
cmd = f'pytest {file_path} --collect-only {args}'
238257
output = subprocess.run(cmd, shell=True, capture_output=True, text=True)
258+
# Exit code 5 means "no tests collected" — normal when a file has no
259+
# matching tests for the requested clouds. Any other non-zero code is a
260+
# real error (e.g. unrecognised arguments, import failure) that would
261+
# silently produce 0 matches and generate an empty pipeline. Fail loudly
262+
# so the build is visibly broken rather than a noop.
263+
if output.returncode not in (0, 5):
264+
print(
265+
f'ERROR: pytest collection failed (exit {output.returncode}) '
266+
f'for {file_path}:\n'
267+
f'STDOUT:\n{output.stdout}\n'
268+
f'STDERR:\n{output.stderr}',
269+
file=sys.stderr)
270+
sys.exit(output.returncode)
239271
matches = re.findall('Collected .+?\.py::(.+?) with marks: \[(.*?)\]',
240272
output.stdout)
241273

@@ -331,9 +363,17 @@ def _generate_pipeline(test_file: str, args: str) -> Dict[str, Any]:
331363
"""Generate a Buildkite pipeline from test files."""
332364
steps = []
333365
generated_steps_set = set()
334-
(default_clouds_to_run, k_value, extra_args, concurrency,
335-
has_env_file) = _parse_args(args)
336-
function_cloud_map = _extract_marked_tests(test_file, args,
366+
(default_clouds_to_run, k_value, extra_args, concurrency, has_env_file,
367+
pytest_native) = _parse_args(args)
368+
# Pass a clean arg string: extra_args (conftest-registered flags extracted
369+
# from the generate_pipeline parser) + pytest_native (conftest-registered
370+
# flags the generate_pipeline parser did not recognise).
371+
# The intent is to keep generate_pipeline-only flags away from
372+
# `pytest --collect-only`: flags missing from older pinned conftests make it
373+
# exit with code 4 and collect 0 tests. NOTE(review): --concurrency is
374+
# dropped, but extra_args as built in _parse_args still carries --base-branch, --submodule-base-branch, --dependency and --generic-cloud; confirm whether those should be filtered out here.
375+
pytest_collect_args = shlex.join(extra_args + list(pytest_native))
376+
function_cloud_map = _extract_marked_tests(test_file, pytest_collect_args,
337377
default_clouds_to_run, k_value,
338378
extra_args)
339379
concurrency_limit = None
@@ -392,32 +432,40 @@ def _generate_pipeline(test_file: str, args: str) -> Dict[str, Any]:
392432
def _dump_pipeline_to_file(yaml_file_path: str,
393433
pipelines: List[Dict[str, Any]],
394434
trigger_command: str,
395-
extra_env: Optional[Dict[str, str]] = None):
435+
extra_env: Optional[Dict[str, str]] = None) -> int:
436+
"""Write the generated steps to a pipeline file; return the step count.
437+
438+
main() always generates more than one pipeline file (e.g. release and
439+
quick-tests-core). A `-k`/file filter often matches tests in only one of
440+
them, so an individual file legitimately ending up with 0 steps is not an
441+
error -- it is skipped here, and main() fails loudly only if *every* file
442+
is empty (the genuine "matched nothing anywhere" misconfiguration).
443+
"""
396444
default_env = {
397445
'LOG_TO_STDOUT': '1',
398446
'SKYPILOT_DISABLE_USAGE_COLLECTION': '1'
399447
}
400448
if extra_env:
401449
default_env.update(extra_env)
450+
all_steps = []
451+
for pipeline in pipelines:
452+
all_steps.extend(pipeline['steps'])
453+
454+
if not all_steps:
455+
# Buildkite rejects pipelines with empty step groups, so skip writing
456+
# this file. main() decides whether 0 steps overall is fatal.
457+
print(f'No matching tests for {yaml_file_path}, skipping.')
458+
return 0
459+
402460
with open(yaml_file_path, 'w', encoding='utf-8') as file:
403461
file.write(GENERATED_FILE_HEAD)
404-
all_steps = []
405-
for pipeline in pipelines:
406-
all_steps.extend(pipeline['steps'])
407-
408462
# Extract key from trigger command, keeping only valid characters
409463
key = re.sub(r'[^a-zA-Z0-9_\-:]', '',
410464
re.match(r'^[^ ]*', trigger_command).group(0))
411465
# Generate formatted group name from key
412466
group_name = ' '.join(
413467
word.capitalize() for word in re.split(r'[-_]', key))
414468

415-
if not all_steps:
416-
# Skip empty groups — Buildkite rejects pipelines with
417-
# empty step groups.
418-
print(f'No matching tests for {yaml_file_path}, skipping.')
419-
return
420-
421469
grouped_steps = [{
422470
'group': group_name,
423471
'key': key,
@@ -431,9 +479,11 @@ def _dump_pipeline_to_file(yaml_file_path: str,
431479

432480
final_pipeline = {'steps': grouped_steps, 'env': default_env}
433481
yaml.dump(final_pipeline, file, default_flow_style=False)
482+
return len(all_steps)
434483

435484

436-
def _convert_release(test_files: List[str], args: str, trigger_command: str):
485+
def _convert_release(test_files: List[str], args: str,
486+
trigger_command: str) -> int:
437487
yaml_file_path = '.buildkite/pipeline_smoke_tests_release.yaml'
438488
output_file_pipelines = []
439489
for test_file in test_files:
@@ -442,8 +492,8 @@ def _convert_release(test_files: List[str], args: str, trigger_command: str):
442492
output_file_pipelines.append(pipeline)
443493
print(f'Converted {test_file} to {yaml_file_path}\n\n')
444494
# Enable all clouds by default for release pipeline.
445-
_dump_pipeline_to_file(yaml_file_path, output_file_pipelines,
446-
trigger_command)
495+
return _dump_pipeline_to_file(yaml_file_path, output_file_pipelines,
496+
trigger_command)
447497

448498

449499
def _rest_request(url: str,
@@ -488,7 +538,7 @@ def _get_latest_pypi_version():
488538

489539

490540
def _convert_quick_tests_core(test_files: List[str], args: str,
491-
trigger_command: str):
541+
trigger_command: str) -> int:
492542
yaml_file_path = '.buildkite/pipeline_smoke_tests_quick_tests_core.yaml'
493543
base_branch = '--base-branch' in args
494544
base_branches = []
@@ -515,10 +565,11 @@ def _convert_quick_tests_core(test_files: List[str], args: str,
515565
pipeline = _generate_pipeline(test_file, args)
516566
output_file_pipelines.append(pipeline)
517567
print(f'Converted {test_file} to {yaml_file_path}\n\n')
518-
_dump_pipeline_to_file(yaml_file_path,
519-
output_file_pipelines,
520-
trigger_command,
521-
extra_env={'SKYPILOT_SUPPRESS_SENSITIVE_LOG': '1'})
568+
return _dump_pipeline_to_file(
569+
yaml_file_path,
570+
output_file_pipelines,
571+
trigger_command,
572+
extra_env={'SKYPILOT_SUPPRESS_SENSITIVE_LOG': '1'})
522573

523574

524575
@click.command()
@@ -562,8 +613,25 @@ def main(args: str, file_pattern: str):
562613
release_files.append(test_file)
563614

564615
print(f'trigger_command: {trigger_command}')
565-
_convert_release(release_files, args, trigger_command)
566-
_convert_quick_tests_core(quick_tests_core_files, args, trigger_command)
616+
total_steps = 0
617+
total_steps += _convert_release(release_files, args, trigger_command)
618+
total_steps += _convert_quick_tests_core(quick_tests_core_files, args,
619+
trigger_command)
620+
621+
if total_steps == 0:
622+
# Every generated pipeline file was empty: pytest --collect-only matched
623+
# no tests anywhere. This is almost always a misconfiguration (wrong
624+
# cloud filter, unrecognised ARGS flag, missing env file, a typo'd -k)
625+
# rather than a legitimate "nothing to run". Fail loudly so the empty
626+
# pipeline is not uploaded as a vacuous success that posts a false
627+
# "passed" status while running zero tests.
628+
print(
629+
'ERROR: No pipeline steps generated for any pipeline file. '
630+
'pytest --collect-only matched 0 tests across all test files. '
631+
'Check that ARGS point to valid tests and that the env-file (if '
632+
'any) is reachable.',
633+
file=sys.stderr)
634+
sys.exit(1)
567635

568636

569637
if __name__ == '__main__':

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,15 +48,15 @@ SkyPilot gives **AI teams** a simple interface to run jobs on any infra.
4848
-----
4949

5050
:fire: *News* :fire:
51+
- [Jun 2026] **Announcing SkyPilot Sandboxes**: run untrusted, LLM-generated code on the Kubernetes clusters you already own. [**Learn more**](https://blog.skypilot.co/sandboxes/), [**join early access**](https://forms.gle/o4keAryXsVazNjyGA)
52+
- [May 2026] **How Multiverse doubled their GPU utilization with SkyPilot**: [**case study**](https://multiversecomputing.com/papers/2x-gpu-utilization-same-hardware-discover-our-efficiency-playbook)
5153
- [Apr 2026] Introducing **GPU Compass**: One dashboard to browse, compare pricing, and launch across every GPU cloud. Try it at [**gpus.skypilot.co**](https://gpus.skypilot.co).
5254
- [Apr 2026] **Research-Driven Agents**: Agents read arXiv papers before coding, landed 5 llama.cpp kernel fusions and +15% faster flash attention in ~3 hours for ~$29: [**blog**](https://blog.skypilot.co/research-driven-agents/), [**HackerNews**](https://news.ycombinator.com/item?id=47706141)
5355
- [Mar 2026] **Scaling Karpathy's Autoresearch**: Autoresearch runs 1 experiment at a time. We gave it 16 GPUs and let it run in parallel: [**blog**](https://blog.skypilot.co/scaling-autoresearch/), [**HackerNews**](https://news.ycombinator.com/item?id=47442435)
5456
- [Mar 2026] **How H Company Unlocked Online RL and Unified their AI Platform**: [**case study**](https://hcompany.ai/unlocking-online-rl-skypilot)
5557
- [Mar 2026] **SkyPilot v0.12** released: Slurm Support, Job Groups for RL, Agent Skill, Recipes, Pool Autoscaling for Batch Inference, 7x Data Mounting, and More: [**Release notes**](https://github.com/skypilot-org/skypilot/releases/tag/v0.12.0)
5658
- [Mar 2026] **SkyPilot Agent Skills**: GPU access and job management for AI agents: [**docs**](https://docs.skypilot.co/en/latest/getting-started/skill.html)
5759
- [Jan 2026] **Shopify case study**: Shopify runs all AI training workloads on SkyPilot: [**case study**](https://shopify.engineering/skypilot)
58-
- [Dec 2025] **SkyPilot v0.11** released: Multi-Cloud Pools, Fast Managed Jobs, Enterprise-Readiness at Large Scale, Programmability. [**Release notes**](https://github.com/skypilot-org/skypilot/releases/tag/v0.11.0)
59-
- [Dec 2025] Train **an agent to use Google Search** as a tool with RL on your Kubernetes or clouds: [**blog**](https://blog.skypilot.co/verl-tool-calling/), [**example**](./llm/verl/)
6060

6161
## Overview
6262

charts/skypilot/templates/api-deployment.yaml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,14 @@ spec:
1414
{{- if and .Values.storage.enabled (ne .Values.storage.accessMode "ReadWriteMany") }}
1515
{{- fail "Local storage with ReadWriteOnce access mode is not supported when using RollingUpdate strategy. Either use Recreate upgrade strategy, set storage.enabled to false, or use ReadWriteMany access mode with a compatible storage class (e.g., NFS-backed storage like Google Filestore)." }}
1616
{{- end }}
17+
{{- $rollingUpdate := .Values.apiService.rollingUpdate | default (dict) }}
1718
strategy:
1819
type: RollingUpdate
1920
rollingUpdate:
20-
maxSurge: 1
21-
maxUnavailable: 0
21+
{{- /* dig (not `default`) so an explicit 0 is honored — Go templates treat
22+
0 as empty, which would silently fall back to the default. */}}
23+
maxSurge: {{ dig "maxSurge" 1 $rollingUpdate }}
24+
maxUnavailable: {{ dig "maxUnavailable" 0 $rollingUpdate }}
2225
{{- else }}
2326
strategy:
2427
type: Recreate

charts/skypilot/tests/deployment_test.yaml

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,50 @@ tests:
191191
fieldRef:
192192
fieldPath: metadata.uid
193193

194+
- it: should honor configurable maxSurge/maxUnavailable including explicit 0
195+
set:
196+
apiService.upgradeStrategy: RollingUpdate
197+
apiService.dbConnectionSecretName: test-db-secret
198+
storage.enabled: false
199+
apiService.rollingUpdate.maxSurge: 0
200+
apiService.rollingUpdate.maxUnavailable: 1
201+
asserts:
202+
- equal:
203+
path: spec.strategy.rollingUpdate.maxSurge
204+
value: 0
205+
- equal:
206+
path: spec.strategy.rollingUpdate.maxUnavailable
207+
value: 1
208+
209+
- it: should support percentage strings for maxSurge/maxUnavailable
210+
set:
211+
apiService.upgradeStrategy: RollingUpdate
212+
apiService.dbConnectionSecretName: test-db-secret
213+
storage.enabled: false
214+
apiService.rollingUpdate.maxSurge: 25%
215+
apiService.rollingUpdate.maxUnavailable: 50%
216+
asserts:
217+
- equal:
218+
path: spec.strategy.rollingUpdate.maxSurge
219+
value: 25%
220+
- equal:
221+
path: spec.strategy.rollingUpdate.maxUnavailable
222+
value: 50%
223+
224+
- it: should fall back to defaults when rollingUpdate is null
225+
set:
226+
apiService.upgradeStrategy: RollingUpdate
227+
apiService.dbConnectionSecretName: test-db-secret
228+
storage.enabled: false
229+
apiService.rollingUpdate: null
230+
asserts:
231+
- equal:
232+
path: spec.strategy.rollingUpdate.maxSurge
233+
value: 1
234+
- equal:
235+
path: spec.strategy.rollingUpdate.maxUnavailable
236+
value: 0
237+
194238
- it: should fail RollingUpdate strategy without external database
195239
set:
196240
apiService.upgradeStrategy: RollingUpdate

charts/skypilot/values.schema.json

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,26 @@
192192
"resources": {
193193
"type": "object"
194194
},
195+
"rollingUpdate": {
196+
"type": [
197+
"object",
198+
"null"
199+
],
200+
"properties": {
201+
"maxSurge": {
202+
"type": [
203+
"integer",
204+
"string"
205+
]
206+
},
207+
"maxUnavailable": {
208+
"type": [
209+
"integer",
210+
"string"
211+
]
212+
}
213+
}
214+
},
195215
"serveServerLog": {
196216
"type": "boolean"
197217
},

0 commit comments

Comments
 (0)