
Commit 53abc6b

Updated yaml files for running in kubernetes

1 parent 83e2386

File tree: 3 files changed (+113 lines, −43 lines)

Dockerfile

Lines changed: 11 additions & 31 deletions

@@ -2,10 +2,10 @@
 FROM ubuntu:22.04
 
 # Set the working directory
-WORKDIR /T2M-GPT
+WORKDIR /P-BIGE
 
 # Install necessary dependencies
-RUN apt-get update && apt-get install -y wget git htop
+RUN apt-get update && apt-get install -y wget git htop xvfb
 
 # Install Miniconda
 RUN MINICONDA_INSTALLER_SCRIPT=Miniconda3-py38_23.1.0-1-Linux-x86_64.sh && \
@@ -18,46 +18,26 @@ RUN MINICONDA_INSTALLER_SCRIPT=Miniconda3-py38_23.1.0-1-Linux-x86_64.sh && \
 # Update PATH to include conda
 ENV PATH=/usr/local/bin:$PATH
 
+# Copy the entire repo (including environment.yml) into the image
+COPY . /P-BIGE
 
-
-# Clone UCSD-Github dataset
-# Set the working directory
-#WORKDIR /
-#RUN git -c http.sslVerify=false clone https://github.com/Rose-STL-Lab/UCSD-OpenCap-Fitness-Dataset.git
-
-
-# Clone the digital-coach-anwesh repository
-#RUN git -c http.sslVerify=false clone https://gitlab.nrp-nautilus.io/shmaheshwari/digital-coach-anwesh.git .
-
-# Copy the environment.yml file and create the conda environment
-# COPY digital-coach-anwesh/environment.yml /T2M-GPT/environment.yml
-COPY . /T2M-GPT
+# Create the conda environment from environment.yml
 RUN conda env create -f environment.yml
 
-# Activate the conda environment
-SHELL ["conda", "run", "-n", "T2M-GPT", "/bin/bash", "-c"]
+# Activate the conda environment for subsequent RUN commands
+SHELL ["conda", "run", "-n", "P-BIGE", "/bin/bash", "-c"]
 
 # Download the model and extractor
 RUN bash dataset/prepare/download_model.sh && \
     bash dataset/prepare/download_extractor.sh
 
-# Install additional Python packages
-RUN pip install --user ipykernel nimblephysics deepspeed polyscope easydict trimesh
+# Install additional Python packages (if needed)
+RUN pip install --user ipykernel polyscope easydict trimesh
 RUN pip install --user --force-reinstall numpy==1.22.0
 
-# Install CUDA toolkit
-# RUN apt-get install -y cuda-toolkit-11-2
-
 # Set up Xvfb for Polyscope
-RUN apt-get install -y xvfb
 ENV DISPLAY=:99.0
-
-# Create a fake screen
 RUN Xvfb :99 -screen 0 1024x768x24 > /dev/null 2>&1 &
 
-# Expose ports 443 and 80
-# EXPOSE 443
-# EXPOSE 80
-
-# Set the entrypoint
-ENTRYPOINT ["conda", "run", "--no-capture-output", "-n", "T2M-GPT", "python"]
+# Set the entrypoint to use the conda environment
+ENTRYPOINT ["conda", "run", "--no-capture-output", "-n", "P-BIGE", "python"]
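
For reference, a minimal sketch of building and running this image locally. The tag name p-bige, the --gpus flag, and the --help invocation are illustrative assumptions, not part of this commit; train_vq.py is assumed to sit at the repo root, as in the Job spec further below.

    # Build the image from the repo root (tag name is an assumption)
    docker build -t p-bige .

    # The ENTRYPOINT runs "python" inside the P-BIGE conda env, so the
    # first argument is the script to execute in the container
    docker run --rm --gpus all p-bige train_vq.py --help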

environment.yml

Lines changed: 13 additions & 12 deletions

@@ -1,4 +1,4 @@
-name: T2M-GPT
+name: P-BIGE
 channels:
   - pytorch
   - defaults
@@ -71,20 +71,25 @@ dependencies:
   - chumpy==0.70
   - cycler==0.10.0
   - decorator==5.0.9
-  - google-auth==1.35.0
+  - deepspeed==0.5.8
+  - gdown
+  - git+https://github.com/nghorbani/human_body_prior
+  - git+https://github.com/openai/CLIP.git
   - google-auth-oauthlib==0.4.5
+  - google-auth==1.35.0
   - grpcio==1.39.0
   - idna==3.2
   - imageio==2.9.0
   - ipdb==0.13.9
-  - ipython==7.26.0
   - ipython-genutils==0.2.0
+  - ipython==7.26.0
   - jedi==0.18.0
   - joblib==1.0.1
   - kiwisolver==1.3.1
   - markdown==3.3.4
-  - matplotlib==3.4.3
   - matplotlib-inline==0.1.2
+  - matplotlib==3.4.3
+  - moviepy
   - nimblephysics
   - oauthlib==3.1.1
   - pandas==1.3.2
@@ -94,31 +99,27 @@ dependencies:
   - prompt-toolkit==3.0.20
   - protobuf==3.17.3
   - ptyprocess==0.7.0
-  - pyasn1==0.4.8
   - pyasn1-modules==0.2.8
+  - pyasn1==0.4.8
   - pygments==2.10.0
   - pyparsing==2.4.7
   - python-dateutil==2.8.2
   - pytz==2021.1
   - pyyaml==5.4.1
-  - requests==2.26.0
   - requests-oauthlib==1.3.0
+  - requests==2.26.0
   - rsa==4.7.2
   - scikit-learn==0.24.2
   - scipy==1.7.1
   - sklearn==0.0
   - smplx==0.1.28
-  - tensorboard==2.6.0
   - tensorboard-data-server==0.6.1
   - tensorboard-plugin-wit==1.8.0
+  - tensorboard==2.6.0
   - threadpoolctl==2.2.0
   - toml==0.10.2
   - tqdm==4.62.2
   - traitlets==5.0.5
   - urllib3==1.26.6
   - wcwidth==0.2.5
-  - werkzeug==2.0.1
-  - git+https://github.com/openai/CLIP.git
-  - git+https://github.com/nghorbani/human_body_prior
-  - gdown
-  - moviepy
+  - werkzeug==2.0.1
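
Because the environment is renamed from T2M-GPT to P-BIGE, an existing local checkout needs its conda environment recreated. A possible sequence, assuming an environment was previously created under the old name:

    # Drop the old environment if present, then rebuild under the new name
    conda env remove -n T2M-GPT
    conda env create -f environment.yml   # creates the P-BIGE environment
    conda activate P-BIGE

    # Quick import check for the newly added dependencies
    python -c "import deepspeed, clip, gdown, moviepy; print('env OK')"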

train_vq.yaml

Lines changed: 89 additions & 0 deletions

@@ -0,0 +1,89 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: train-vq-job
+spec:
+  template:
+    spec:
+      containers:
+      - name: gpu-container
+        image: gitlab-registry.nrp-nautilus.io/shmaheshwari/digital-coach-anwesh:latest
+        command: ["/bin/bash", "-c"]
+        args:
+          - |
+            conda init bash
+            source ~/.bashrc
+            source /root/.bashrc
+            conda activate T2M-GPT
+
+            apt-get update && apt-get install -y nvidia-cuda-toolkit && \
+            mkdir -p /usr/local/cuda/bin && \
+            ln -sf /usr/bin/nvcc /usr/local/cuda/bin/nvcc && \
+            ln -sf /usr/lib/x86_64-linux-gnu /usr/local/cuda/lib64 && \
+            export CUDA_HOME=/usr/local/cuda && \
+            export PATH=$PATH:/root/.local/bin && \
+            export PATH=$PATH:$CUDA_HOME/bin:/root/.local/bin && \
+            export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDA_HOME/lib64 && \
+
+            export CUDA_HOME=/usr/local/cuda && \
+            export PATH=$PATH:$CUDA_HOME/bin:/root/.local/bin && \
+            export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDA_HOME/lib64 && \
+
+            export PATH=$PATH:$CUDA_HOME/bin:/root/.local/bin
+            export PYTHONUNBUFFERED=1
+
+            git clone https://github.com/Starfractor/P-BIGE.git /workspace &&
+            cd /workspace && \
+
+            pip install nimblephysics && \
+            pip install tensorboard && \
+            pip install git+https://github.com/openai/CLIP.git && \
+            pip install scipy && \
+            pip install imageio && \
+            pip install matplotlib && \
+            pip install deepspeed && \
+
+            deepspeed train_vq.py \
+              --batch-size 256 \
+              --lr 2e-4 \
+              --total-iter 300000 \
+              --lr-scheduler 200000 \
+              --nb-code 512 \
+              --down-t 2 \
+              --depth 3 \
+              --dilation-growth-rate 3 \
+              --out-dir output \
+              --dataname mcs \
+              --vq-act relu \
+              --quantizer ema_reset \
+              --loss-vel 0.5 \
+              --recons-loss l1_smooth \
+              --exp-name VQVAE9_Bs32
+        resources:
+          limits:
+            nvidia.com/gpu: 2
+            cpu: "2.4"
+            memory: "9830Mi"
+            ephemeral-storage: "20Gi"
+          requests:
+            nvidia.com/gpu: 2
+            cpu: "2"
+            memory: "8Gi"
+            ephemeral-storage: "10Gi"
+        volumeMounts:
+        - name: biomechanics-dataset
+          mountPath: /home/mnt/data
+      volumes:
+      - name: biomechanics-dataset
+        persistentVolumeClaim:
+          claimName: add-biomechanics-dataset-pvc
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: nvidia.com/gpu.product
+                operator: In
+                values:
+                - NVIDIA-GeForce-RTX-2080-Ti
+      restartPolicy: Never
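
A possible way to submit and monitor this Job on the cluster; namespace and context flags are omitted here and may be required depending on the local kubectl configuration.

    # Create the batch Job defined above
    kubectl apply -f train_vq.yaml

    # Pods created by a Job carry the job-name label
    kubectl get pods -l job-name=train-vq-job

    # Stream the training output
    kubectl logs -f job/train-vq-job

    # Remove the Job (and its pods) once finished
    kubectl delete job train-vq-job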
