Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions charts/skypilot/templates/api-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.uid
- name: SKYPILOT_ROLLING_UPDATE_ENABLED
value: "true"
{{- end }}
{{- if .Values.apiService.metrics.enabled }}
- name: SKY_API_SERVER_METRICS_ENABLED
Expand Down
4 changes: 4 additions & 0 deletions docs/source/examples/managed-jobs.rst
Original file line number Diff line number Diff line change
Expand Up @@ -763,4 +763,8 @@ To enable the consolidated deployment, set :ref:`consolidation_mode <config-yaml
See :ref:`more about the Kubernetes upgrade strategy of the API server <sky-api-server-graceful-upgrade>`.

.. warning::

When using consolidation mode with a remote :ref:`SkyPilot API server with RollingUpdate upgrade strategy <sky-api-server-upgrade-strategy>`, any file mounts or workdirs that upload local files/folders for managed jobs will be lost during a rolling update. To address this, configure a cloud bucket via :ref:`config-yaml-jobs-bucket` in your :ref:`SkyPilot config <config-yaml>` to persist them.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
When using consolidation mode with a remote :ref:`SkyPilot API server with RollingUpdate upgrade strategy <sky-api-server-upgrade-strategy>`, any file mounts or workdirs that uploads local files/folders of the managed jobs will be lost during a rolling update. To address that, configure a cloud bucket via :ref:`config-yaml-jobs-bucket` in your :ref:`SkyPilot config <config-yaml>` to persist them.
When using consolidation mode with a remote :ref:`SkyPilot API server with RollingUpdate upgrade strategy <sky-api-server-upgrade-strategy>`, any file mounts or workdirs that uploads local files/folders of the managed jobs will be lost during a rolling update. To address that, use :ref:`bucket <todo-link>`, :ref:`volume <todo-link>`, or :ref:`git <todo-link>`; or, configure a cloud bucket for all local files via :ref:`config-yaml-jobs-bucket` in your :ref:`SkyPilot config <config-yaml>` to persist them.
.. code-block::
jobs:
bucket: s3://xxx


The jobs controller adds some overhead: it reserves an extra 2GB of memory for itself, which may reduce the number of requests your API server can handle. To counteract this, you can increase the amount of CPU and memory allocated to the API server; see :ref:`sky-api-server-resources-tuning`.
49 changes: 49 additions & 0 deletions sky/jobs/server/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from sky.backends import backend_utils
from sky.backends import cloud_vm_ray_backend
from sky.catalog import common as service_catalog_common
from sky.data import data_utils
from sky.data import storage as storage_lib
from sky.jobs import constants as managed_job_constants
from sky.jobs import state as managed_job_state
Expand Down Expand Up @@ -93,6 +94,51 @@
]


def _warn_file_mounts_rolling_update(dag: 'sky.Dag') -> None:
    """Log a warning when a rolling update could drop locally uploaded files.

    With rolling update enabled and consolidation mode on, but without a jobs
    bucket configured, local file mounts and workdirs are stored locally on
    the API server pod and will be lost during a rolling update.

    The warning is emitted only when all of the following hold:

    * the environment variable named by
      ``skylet_constants.SKYPILOT_ROLLING_UPDATE_ENABLED`` is set,
    * ``managed_job_utils.is_consolidation_mode()`` is true,
    * the SkyPilot config has no ``jobs.bucket`` entry, and
    * some task in ``dag`` has a file mount whose source is not a cloud
      store URL, or a workdir given as a string.

    Args:
        dag: The DAG whose tasks are inspected.
    """
    # Guard clauses, in the same order as the checks are listed above.
    rolling_update_flag = os.environ.get(
        skylet_constants.SKYPILOT_ROLLING_UPDATE_ENABLED)
    if rolling_update_flag is None:
        return
    if not managed_job_utils.is_consolidation_mode():
        return
    jobs_bucket = skypilot_config.get_nested(('jobs', 'bucket'), None)
    if jobs_bucket is not None:
        return

    def _uses_local_files(task) -> bool:
        """Return True if the task has a local file mount or str workdir."""
        mounts = task.file_mounts
        if mounts and any(not data_utils.is_cloud_store_url(source)
                          for source in mounts.values()):
            return True
        workdir = task.workdir
        # Only a string workdir counts here; other workdir types are skipped.
        return bool(workdir) and isinstance(workdir, str)

    if not any(_uses_local_files(task) for task in dag.tasks):
        return

    message = (
        f'{colorama.Fore.YELLOW}WARNING: Local file mounts or workdir detected '
        'with rolling update enabled for API server. '
        'To persist files across API server restarts/update, configure a cloud '
        'bucket in your SkyPilot config under `jobs.bucket`.'
        f'{colorama.Style.RESET_ALL}')
    logger.warning(message)
Comment on lines +135 to +139
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
f'{colorama.Fore.YELLOW}WARNING: Local file mounts or workdir detected '
'with rolling update enabled for API server. '
'To persist files across API server restarts/update, configure a cloud '
f'bucket in your SkyPilot config under `jobs.bucket`.'
f'{colorama.Style.RESET_ALL}')
f'{colorama.Fore.YELLOW}WARNING: Local file mounts or workdir detected '
'with rolling update enabled for API server. '
'To persist files across API server restarts/update, use buckets, volumes, or git '
'for your file mounts; or, configure a bucket in your SkyPilot config under '
f'`jobs.bucket`. {colorama.Style.RESET_ALL}')



def _upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
"""Upload files to the controller.

Expand Down Expand Up @@ -353,6 +399,9 @@ def launch(
f'with:\n\n`sky down {cluster_name} --purge`\n\n'
f'Reason: {common_utils.format_exception(e)}')

# Warn if file mounts may be lost during rolling update
_warn_file_mounts_rolling_update(dag)

local_to_controller_file_mounts = _upload_files_to_controller(dag)
controller = controller_utils.Controllers.JOBS_CONTROLLER
controller_name = controller.value.cluster_name
Expand Down
3 changes: 3 additions & 0 deletions sky/skylet/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -523,6 +523,9 @@
ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'
OVERRIDE_CONSOLIDATION_MODE = 'IS_SKYPILOT_JOB_CONTROLLER'
IS_SKYPILOT_SERVE_CONTROLLER = 'IS_SKYPILOT_SERVE_CONTROLLER'
# Environment variable that is set to 'true' if rolling update strategy is
# enabled for the API server deployment.
SKYPILOT_ROLLING_UPDATE_ENABLED = 'SKYPILOT_ROLLING_UPDATE_ENABLED'

SERVE_OVERRIDE_CONCURRENT_LAUNCHES = (
f'{SKYPILOT_ENV_VAR_PREFIX}SERVE_OVERRIDE_CONCURRENT_LAUNCHES')
Expand Down
Loading