Skip to content

Commit 4f1df76

Browse files
committed
Revert "[Kubernetes] Self-heal half-installed conda/venv + clean rsync on un-marked PVC"
This reverts commit acf0dbc.
1 parent acf0dbc commit 4f1df76

2 files changed

Lines changed: 4 additions & 37 deletions

File tree

sky/skylet/constants.py

Lines changed: 3 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -228,18 +228,8 @@
228228
# We use python 3.10 to be consistent with the python version of
229229
# AWS's Deep Learning AMI's default conda environment.
230230
CONDA_INSTALLATION_COMMANDS = (
231-
# Validate that conda *runs*, not just that the binary exists. A
232-
# half-installed conda from a previous crashed setup run can leave a
233-
# binary on the PVC with an unrelocated shebang (e.g.
234-
# `#!/croot/conda_…/_h_env_placehold…/bin/python`) which `which`
235-
# treats as present but every invocation fails with "command not
236-
# found" because the shebang's python path doesn't exist. In that
237-
# case we must reinstall — but the Miniconda installer refuses to
238-
# overwrite an existing prefix, so we also wipe the broken dir first.
239-
f'(command -v conda >/dev/null 2>&1 && conda --version >/dev/null 2>&1) || '
231+
'which conda > /dev/null 2>&1 || '
240232
'{ '
241-
f'echo "conda not functional; reinstalling miniconda at {SKY_CONDA_ROOT}"; '
242-
f'rm -rf "{SKY_CONDA_ROOT}"; '
243233
# Use uname -m to get the architecture of the machine and download the
244234
# corresponding Miniconda3-Linux.sh script.
245235
# Download to /tmp to ensure write access for non-root users.
@@ -274,18 +264,7 @@
274264
# Install uv for venv management and pip installation.
275265
f'{SKY_UV_INSTALL_CMD};'
276266
# Create a separate python environment for SkyPilot dependencies.
277-
#
278-
# Validate that the venv *works*, not just that the directory exists.
279-
# A previous crashed install can leave a directory with partial
280-
# contents (no bin/python, or a broken bin/python whose interpreter
281-
# files were truncated). `uv pip install --python <venv>/bin/python`
282-
# downstream then fails with "No virtual environment or system Python
283-
# installation found for path". Test the interpreter end-to-end and
284-
# rebuild the venv if it doesn't run.
285-
f'([ -x {SKY_REMOTE_PYTHON_ENV}/bin/python ] && '
286-
f'{SKY_REMOTE_PYTHON_ENV}/bin/python -c "import sys" >/dev/null 2>&1) || '
287-
f'{{ echo "skypilot-runtime venv not functional; recreating"; '
288-
f'rm -rf {SKY_REMOTE_PYTHON_ENV}; '
267+
f'[ -d {SKY_REMOTE_PYTHON_ENV} ] || '
289268
# Do NOT use --system-site-packages here, because if users upgrade any
290269
# packages in the base env, they interfere with skypilot dependencies.
291270
# Reference: https://github.com/skypilot-org/skypilot/issues/4097
@@ -298,7 +277,7 @@
298277
# uv to use the python version specified in the `.python_version` file.
299278
# TODO(zhwu): consider adding --python-preference only-managed to avoid
300279
# using the system python, if a user reports such an issue.
301-
f'{SKY_UV_CMD} venv --seed {SKY_REMOTE_PYTHON_ENV} --python 3.10; }};'
280+
f'{SKY_UV_CMD} venv --seed {SKY_REMOTE_PYTHON_ENV} --python 3.10;'
302281
f'echo "$(echo {SKY_REMOTE_PYTHON_ENV})/bin/python" > {SKY_PYTHON_PATH_FILE};' # pylint: disable=line-too-long
303282
)
304283

sky/templates/kubernetes-ray.yml.j2

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1384,19 +1384,7 @@ available_node_types:
13841384
# which caused 'Operation not permitted' errors on the PVC root directory (/mnt/home).
13851385
# Owner/group preservation (-o, -g) is also skipped (default for -rl), ensuring
13861386
# files are owned by the container's user/group.
1387-
#
1388-
# --delete: if the marker is missing, treat the PVC as needing a
1389-
# fresh start. A previous controller pod can crash mid-install
1390-
# (e.g. miniconda half-extracted, skypilot-runtime venv missing
1391-
# bin/python) before the marker is written; on restart the PVC
1392-
# still has the partial files. Without --delete, this initContainer
1393-
# just overlays image files on top, leaving the corruption in place
1394-
# — downstream `which conda` finds a broken conda binary, skips
1395-
# reinstall, and the runtime-setup step crashes. With --delete,
1396-
# the PVC is mirrored to the image's /home/sky contents on each
1397-
# un-marked start, so a crashed mid-install can't poison subsequent
1398-
# starts.
1399-
rsync -rl --delete "$SOURCE_PATH/" "$DEST_PATH"
1387+
rsync -rl "$SOURCE_PATH/" "$DEST_PATH"
14001388

14011389
# Check if rsync failed
14021390
if [ $? -ne 0 ]; then

0 commit comments

Comments
 (0)