Skip to content

Commit fd31652

Browse files
Fix/pytorch add updated start script (#10)
* update * fix: add missing start1.sh to container-template for pytorch build * Update container-template/updated_start.sh Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * name change * rename start1.sh to updated_start.sh --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 7f0f3ba commit fd31652

3 files changed

Lines changed: 143 additions & 67 deletions

File tree

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
2+
#!/bin/bash
3+
set -e # Exit the script if any statement returns a non-true return value
4+
5+
# ---------------------------------------------------------------------------- #
6+
# Function Definitions #
7+
# ---------------------------------------------------------------------------- #
8+
9+
# Start nginx service
10+
start_nginx() {
  # Announce the step on stdout, then start Nginx through the `service` wrapper.
  # Takes no arguments; the exit status is that of `service nginx start`.
  printf '%s\n' "Starting Nginx service..."
  service nginx start
}
14+
15+
# Execute script if exists
16+
execute_script() {
  # Run an optional hook script with bash if it exists as a regular file.
  # Arguments:
  #   $1 - path of the script to run
  #   $2 - message printed to stdout just before the script is run
  # Returns: 0 when the script is absent; otherwise the script's exit status.
  local script_path=$1
  local script_msg=$2
  if [ -f "${script_path}" ]; then
    echo "${script_msg}"
    # Quoted so that a path containing spaces or glob characters is passed
    # to bash as a single argument instead of being word-split.
    bash "${script_path}"
  fi
}
24+
25+
# Setup ssh
26+
setup_ssh() {
  # Prepare and start the SSH service.
  #   1. If PUBLIC_KEY is non-empty, append it to ~/.ssh/authorized_keys.
  #   2. Generate any missing host key (rsa, dsa, ecdsa, ed25519) under
  #      /etc/ssh and print its fingerprint.
  #   3. Start sshd through `service ssh start`.
  #   4. Print the fingerprint of every public key found in /etc/ssh.
  # Globals: PUBLIC_KEY (read)
  local key_type key_file key

  if [ "$PUBLIC_KEY" ]; then
    echo "Setting up SSH..."
    mkdir -p ~/.ssh
    echo "$PUBLIC_KEY" >> ~/.ssh/authorized_keys
    chmod 700 -R ~/.ssh
  fi

  for key_type in rsa dsa ecdsa ed25519; do
    key_file="/etc/ssh/ssh_host_${key_type}_key"
    if [ -f "${key_file}" ]; then
      continue
    fi

    if [ "${key_type}" = "dsa" ]; then
      # Newer OpenSSH builds ship without DSA support, so generating this key
      # can fail. Treat it as best-effort: under `set -e` a hard failure here
      # would abort the whole start script before sshd is ever started.
      if ! ssh-keygen -t dsa -f "${key_file}" -q -N ''; then
        echo "Warning: could not generate DSA host key, skipping." >&2
        continue
      fi
    else
      ssh-keygen -t "${key_type}" -f "${key_file}" -q -N ''
    fi

    # ${key_type^^} upper-cases the type: "RSA key fingerprint:", etc.
    echo "${key_type^^} key fingerprint:"
    ssh-keygen -lf "${key_file}.pub"
  done

  service ssh start

  echo "SSH host keys:"
  for key in /etc/ssh/*.pub; do
    # An unmatched glob stays as the literal pattern; skip it.
    [ -e "${key}" ] || continue
    echo "Key: $key"
    ssh-keygen -lf "${key}"
  done
}
67+
68+
# Export env vars
69+
export_env_vars() {
  # Persist selected environment variables so that later interactive shells
  # pick them up through ~/.bashrc.
  # Arguments:
  #   $1 - (optional) file the export lines are appended to;
  #        defaults to /etc/rp_environment
  # Outputs: a progress message on stdout; appends to the env file and,
  #          once only, a `source` line to ~/.bashrc.
  local env_file="${1:-/etc/rp_environment}"

  echo "Exporting environment variables..."

  # Split each line on the FIRST "=" only. Splitting on every "=" and printing
  # just the second field truncates values that themselves contain "="
  # (base64 padding, URLs with query strings, ...).
  printenv \
    | grep -E '^YOTTA_|^PATH=|^_=' \
    | awk '{
        eq = index($0, "=")
        if (eq == 0) { name = $0; value = "" }
        else { name = substr($0, 1, eq - 1); value = substr($0, eq + 1) }
        print "export " name "=\"" value "\""
      }' >> "${env_file}"

  echo 'export PATH=/usr/local/nvidia/bin:/usr/local/cuda-12.8/bin:~/.local/bin:$PATH' >> "${env_file}"

  # The start script runs again on every container restart; add the hook
  # to ~/.bashrc only if it is not already there.
  if ! grep -qxF -- "source ${env_file}" ~/.bashrc 2>/dev/null; then
    echo "source ${env_file}" >> ~/.bashrc
  fi
}
75+
76+
# Start jupyter lab
77+
start_jupyter() {
  # Launch Jupyter Lab in the background when JUPYTER_PASSWORD is non-empty.
  # The server listens on port 8888, uses JUPYTER_PASSWORD as its access
  # token, prefers /workspace as its directory and logs to
  # /workspace/jupyter.log.
  # Globals: JUPYTER_PASSWORD (read)
  if [ "$JUPYTER_PASSWORD" ]; then
    echo "Starting Jupyter Lab..."
    # The token is quoted so a password containing spaces or glob characters
    # reaches Jupyter as one argument. The "*" values are quoted so the shell
    # never tries to expand them as filename patterns.
    mkdir -p /workspace && \
      cd / && \
      nohup python3 -m jupyter lab --allow-root --no-browser --port=8888 \
        '--ip=*' \
        --FileContentsManager.delete_to_trash=False \
        --ServerApp.terminado_settings='{"shell_command":["/bin/bash"]}' \
        --ServerApp.token="$JUPYTER_PASSWORD" \
        '--ServerApp.allow_origin=*' \
        --ServerApp.preferred_dir=/workspace &> /workspace/jupyter.log &
    echo "Jupyter Lab started"
  fi
}
86+
87+
88+
# ---------------------------------------------------------------------------- #
89+
# Main Program #
90+
# ---------------------------------------------------------------------------- #
91+
92+
# Entry point: bring the pod's services up in order, then keep the
# container's main process alive.
main() {
  start_nginx

  execute_script "/pre_start.sh" "Running pre-start script..."

  echo "Pod Started"

  setup_ssh
  start_jupyter
  export_env_vars

  execute_script "/post_start.sh" "Running post-start script..."

  echo "Start script(s) finished, pod is ready to use."

  # Block forever so the container does not exit once setup is done.
  sleep infinity
}

main "$@"

official-templates/pytorch/Dockerfile

Lines changed: 34 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,13 @@ ENV DEBIAN_FRONTEND=noninteractive \
2222
SHELL=/bin/bash \
2323
PATH=/opt/conda/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/bin:$PATH \
2424
LD_LIBRARY_PATH=/usr/local/nvidia/lib64:$LD_LIBRARY_PATH \
25-
JUPYTER_PASSWORD=ubuntu
25+
JUPYTER_PASSWORD=yotta
2626

2727
# ===============================
2828
# Workspace
2929
# ===============================
3030
WORKDIR /
31-
RUN mkdir -p /workspace && chmod 777 /workspace
31+
RUN mkdir -p /workspace && chmod 777 /workspace /root
3232

3333
# ===============================
3434
# Base system packages
@@ -44,46 +44,16 @@ RUN apt-get update -y && \
4444
build-essential pkg-config \
4545
&& echo "en_US.UTF-8 UTF-8" > /etc/locale.gen \
4646
&& locale-gen \
47-
&& mkdir -p /var/run/sshd \
47+
&& mkdir -p /var/run/sshd /var/log/supervisor \
48+
&& chmod 700 /var/run/sshd /var/log/supervisor \
49+
&& chmod 755 /var/log \
4850
&& apt-get clean \
4951
&& rm -rf /var/lib/apt/lists/*
5052

5153
# ===============================
52-
# uv (Astral) - Python package manager
53-
# - Install to /usr/local/bin
54-
# - Avoids modifying shell profile (suitable for container/CI)
54+
# Remove ubuntu user (for security: prevent unauthorized SSH access)
5555
# ===============================
56-
ARG UV_VERSION="latest"
57-
RUN set -eux; \
58-
if [ "${UV_VERSION}" = "latest" ]; then \
59-
curl -LsSf https://astral.sh/uv/install.sh | env UV_UNMANAGED_INSTALL="/usr/local/bin" sh; \
60-
else \
61-
curl -LsSf "https://astral.sh/uv/${UV_VERSION}/install.sh" | env UV_UNMANAGED_INSTALL="/usr/local/bin" sh; \
62-
fi; \
63-
uv --version
64-
65-
# ===============================
66-
# Miniconda
67-
# ===============================
68-
ARG MINICONDA_VERSION="py311_24.1.2-0"
69-
ARG CONDA_DIR="/opt/conda"
70-
71-
RUN set -eux; \
72-
ARCH="$(uname -m)"; \
73-
case "${ARCH}" in \
74-
x86_64) MINICONDA_ARCH="x86_64" ;; \
75-
aarch64) MINICONDA_ARCH="aarch64" ;; \
76-
*) echo "Unsupported arch: ${ARCH}" && exit 1 ;; \
77-
esac; \
78-
curl -fsSL \
79-
"https://repo.anaconda.com/miniconda/Miniconda3-${MINICONDA_VERSION}-Linux-${MINICONDA_ARCH}.sh" \
80-
-o /tmp/miniconda.sh; \
81-
bash /tmp/miniconda.sh -b -p "${CONDA_DIR}"; \
82-
rm -f /tmp/miniconda.sh; \
83-
"${CONDA_DIR}/bin/conda" config --system --set auto_activate_base false; \
84-
"${CONDA_DIR}/bin/conda" clean -afy
85-
86-
RUN ln -sf /opt/conda/bin/conda /usr/local/bin/conda
56+
RUN userdel -r ubuntu || true
8757

8858
# ===============================
8959
# Python 3.11 (build from source, with ensurepip)
@@ -101,7 +71,7 @@ RUN set -eux; \
10171
&& tar -xzf /tmp/Python.tgz -C /tmp/python-src --strip-components=1 \
10272
&& rm -f /tmp/Python.tgz \
10373
&& cd /tmp/python-src \
104-
&& ./configure --enable-optimizations --with-ensurepip=install \
74+
&& ./configure --with-ensurepip=install \
10575
&& make -j"$(nproc)" \
10676
&& make altinstall \
10777
&& cd / \
@@ -125,15 +95,23 @@ RUN python -m pip install --no-cache-dir \
12595
huggingface-hub datasets
12696

12797
# ===============================
128-
# Patch: ensure python3.11 has Jupyter (required by /start.sh)
129-
# Only adds jupyter to the python3.11 environment, does not modify the existing pip install logic
98+
# Build-time assertion: verify Jupyter installation
13099
# ===============================
131-
RUN /usr/local/bin/python3.11 -m ensurepip --upgrade && \
132-
/usr/local/bin/python3.11 -m pip install --no-cache-dir \
133-
jupyterlab ipywidgets jupyter-archive notebook==7.3.3
100+
RUN python -c "import jupyter; import notebook; import jupyterlab; print('jupyter ok')"
134101

135-
# Build-time assertion: prevents pushing a broken image
136-
RUN /usr/local/bin/python3.11 -c "import jupyter; import notebook; import jupyterlab; print('python3.11 jupyter ok')"
102+
# ===============================
103+
# Configure JupyterLab: auto-login with token (no password prompt)
104+
# ===============================
105+
RUN mkdir -p /root/.jupyter && printf '%s\n' \
106+
'c.ServerApp.token = "yotta"' \
107+
'c.ServerApp.password = ""' \
108+
'c.ServerApp.allow_remote_access = True' \
109+
'c.ServerApp.allow_origin = "*"' \
110+
'c.NotebookApp.token = "yotta"' \
111+
'c.NotebookApp.password = ""' \
112+
'c.NotebookApp.allow_remote_access = True' \
113+
> /root/.jupyter/jupyter_lab_config.py && \
114+
chmod 600 /root/.jupyter/jupyter_lab_config.py
137115

138116
# ===============================
139117
# NCCL tests (build from source, force MPI=0 to avoid mpi.h missing)
@@ -145,31 +123,23 @@ RUN set -eux; \
145123
ln -sf /opt/nccl-tests/build/* /usr/local/bin/; \
146124
rm -rf /opt/nccl-tests/.git
147125

148-
# ===============================
149-
# User
150-
# ===============================
151-
RUN useradd -ms /bin/bash ubuntu && \
152-
usermod -aG sudo ubuntu && \
153-
echo "ubuntu ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/ubuntu && \
154-
echo "ubuntu:ubuntu" | chpasswd
155-
156-
# ===============================
157-
# SSH config (start.sh handles sshd startup; this ensures password login is enabled)
158-
# ===============================
159-
RUN sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config && \
160-
sed -i 's/PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
161-
rm -f /etc/ssh/ssh_host_*
162126

163127
# ===============================
164128
# CUDA bin convenience
165129
# ===============================
166130
RUN ln -sf /usr/local/cuda/bin/* /usr/bin/ || true
167131

168132
# ===============================
169-
# start.sh (from buildx bake context "scripts")
133+
# Supervisor configuration
170134
# ===============================
171-
COPY --from=scripts start.sh /start.sh
172-
RUN chmod 755 /start.sh
135+
RUN mkdir -p /var/log/supervisor /usr/local/bin && \
136+
chmod 777 /var/log/supervisor /workspace /var/run /var/lib/nginx && \
137+
mkdir -p /run/sshd && \
138+
chmod 700 /run/sshd
139+
140+
COPY --from=scripts start1.sh /start1.sh
141+
RUN chmod 755 /start1.sh && \
142+
sed -i 's/\r$//' /start1.sh
173143

174144
# ===============================
175145
# nginx / branding
@@ -187,8 +157,8 @@ RUN echo 'cat /etc/yotta.txt' >> /root/.bashrc
187157
EXPOSE 22 80 8888
188158

189159
# ===============================
190-
# Entrypoint: root runs start.sh directly (does not modify the shared start.sh)
160+
# Entrypoint: root runs start1.sh with explicit bash (ensures bash syntax works)
191161
# ===============================
192162
USER root
193163
WORKDIR /root
194-
CMD ["/bin/bash", "-lc", "exec /start.sh"]
164+
CMD ["/bin/bash", "-c", "exec /bin/bash /start1.sh"]

official-templates/pytorch/docker-bake.hcl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
variable "PUBLISHER" { default = "yottalabsai" }
2-
variable "TAG_SUFFIX" { default = "2026010901" }
2+
variable "TAG_SUFFIX" { default = "2026031701" }
33

44
group "default" {
55
targets = ["pytorch290"]
@@ -15,7 +15,7 @@ target "pytorch290" {
1515
dockerfile = "Dockerfile"
1616

1717
tags = [
18-
"${PUBLISHER}/pytorch:2.9.0-py3.11-cuda12.8.1-cudnn-devel-ubuntu22.04"
18+
"${PUBLISHER}/pytorch:${TAG_SUFFIX}"
1919
]
2020

2121
contexts = {
@@ -25,7 +25,7 @@ target "pytorch290" {
2525
}
2626

2727
args = {
28-
BASE_IMAGE = "nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04"
28+
BASE_IMAGE = "nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04"
2929
PYTHON_VERSION = "3.11.14"
3030
TORCH = "torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128"
3131
}

0 commit comments

Comments
 (0)