
Commit 53abc6b

Updated yaml files for running in kubernetes

1 parent 83e2386

File tree: 3 files changed (+113 lines, −43 lines)

Dockerfile

Lines changed: 11 additions & 31 deletions

@@ -2,10 +2,10 @@
 FROM ubuntu:22.04
 
 # Set the working directory
-WORKDIR /T2M-GPT
+WORKDIR /P-BIGE
 
 # Install necessary dependencies
-RUN apt-get update && apt-get install -y wget git htop
+RUN apt-get update && apt-get install -y wget git htop xvfb
 
 # Install Miniconda
 RUN MINICONDA_INSTALLER_SCRIPT=Miniconda3-py38_23.1.0-1-Linux-x86_64.sh && \
@@ -18,46 +18,26 @@ RUN MINICONDA_INSTALLER_SCRIPT=Miniconda3-py38_23.1.0-1-Linux-x86_64.sh && \
 # Update PATH to include conda
 ENV PATH=/usr/local/bin:$PATH
 
+# Copy the entire repo (including environment.yml) into the image
+COPY . /P-BIGE
 
-
-# Clone UCSD-Github dataset
-# Set the working directory
-#WORKDIR /
-#RUN git -c http.sslVerify=false clone https://github.com/Rose-STL-Lab/UCSD-OpenCap-Fitness-Dataset.git
-
-
-# Clone the digital-coach-anwesh repository
-#RUN git -c http.sslVerify=false clone https://gitlab.nrp-nautilus.io/shmaheshwari/digital-coach-anwesh.git .
-
-# Copy the environment.yml file and create the conda environment
-# COPY digital-coach-anwesh/environment.yml /T2M-GPT/environment.yml
-COPY . /T2M-GPT
+# Create the conda environment from environment.yml
 RUN conda env create -f environment.yml
 
-# Activate the conda environment
-SHELL ["conda", "run", "-n", "T2M-GPT", "/bin/bash", "-c"]
+# Activate the conda environment for subsequent RUN commands
+SHELL ["conda", "run", "-n", "P-BIGE", "/bin/bash", "-c"]
 
 # Download the model and extractor
 RUN bash dataset/prepare/download_model.sh && \
     bash dataset/prepare/download_extractor.sh
 
-# Install additional Python packages
-RUN pip install --user ipykernel nimblephysics deepspeed polyscope easydict trimesh
+# Install additional Python packages (if needed)
+RUN pip install --user ipykernel polyscope easydict trimesh
 RUN pip install --user --force-reinstall numpy==1.22.0
 
-# Install CUDA toolkit
-# RUN apt-get install -y cuda-toolkit-11-2
-
 # Set up Xvfb for Polyscope
-RUN apt-get install -y xvfb
 ENV DISPLAY=:99.0
-
-# Create a fake screen
 RUN Xvfb :99 -screen 0 1024x768x24 > /dev/null 2>&1 &
 
-# Expose ports 443 and 80
-# EXPOSE 443
-# EXPOSE 80
-
-# Set the entrypoint
-ENTRYPOINT ["conda", "run", "--no-capture-output", "-n", "T2M-GPT", "python"]
+# Set the entrypoint to use the conda environment
+ENTRYPOINT ["conda", "run", "--no-capture-output", "-n", "P-BIGE", "python"]
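
For reference, a minimal sketch of building and running this image locally. The tag name p-bige, the --gpus flag, and the --help invocation are illustrative assumptions, not part of this commit; train_vq.py is assumed to sit at the repo root, as in the Job spec further below.

    # Build the image from the repo root (tag name is an assumption)
    docker build -t p-bige .

    # The ENTRYPOINT runs "python" inside the P-BIGE conda env, so the
    # first argument is the script to execute in the container
    docker run --rm --gpus all p-bige train_vq.py --help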

environment.yml

Lines changed: 13 additions & 12 deletions

@@ -1,4 +1,4 @@
-name: T2M-GPT
+name: P-BIGE
 channels:
   - pytorch
   - defaults
@@ -71,20 +71,25 @@ dependencies:
   - chumpy==0.70
   - cycler==0.10.0
   - decorator==5.0.9
-  - google-auth==1.35.0
+  - deepspeed==0.5.8
+  - gdown
+  - git+https://github.com/nghorbani/human_body_prior
+  - git+https://github.com/openai/CLIP.git
   - google-auth-oauthlib==0.4.5
+  - google-auth==1.35.0
   - grpcio==1.39.0
   - idna==3.2
   - imageio==2.9.0
   - ipdb==0.13.9
-  - ipython==7.26.0
   - ipython-genutils==0.2.0
+  - ipython==7.26.0
   - jedi==0.18.0
   - joblib==1.0.1
   - kiwisolver==1.3.1
   - markdown==3.3.4
-  - matplotlib==3.4.3
   - matplotlib-inline==0.1.2
+  - matplotlib==3.4.3
+  - moviepy
   - nimblephysics
   - oauthlib==3.1.1
   - pandas==1.3.2
@@ -94,31 +99,27 @@ dependencies:
   - prompt-toolkit==3.0.20
   - protobuf==3.17.3
   - ptyprocess==0.7.0
-  - pyasn1==0.4.8
   - pyasn1-modules==0.2.8
+  - pyasn1==0.4.8
   - pygments==2.10.0
   - pyparsing==2.4.7
   - python-dateutil==2.8.2
   - pytz==2021.1
   - pyyaml==5.4.1
-  - requests==2.26.0
   - requests-oauthlib==1.3.0
+  - requests==2.26.0
   - rsa==4.7.2
   - scikit-learn==0.24.2
   - scipy==1.7.1
   - sklearn==0.0
   - smplx==0.1.28
-  - tensorboard==2.6.0
   - tensorboard-data-server==0.6.1
   - tensorboard-plugin-wit==1.8.0
+  - tensorboard==2.6.0
   - threadpoolctl==2.2.0
   - toml==0.10.2
   - tqdm==4.62.2
   - traitlets==5.0.5
   - urllib3==1.26.6
   - wcwidth==0.2.5
-  - werkzeug==2.0.1
-  - git+https://github.com/openai/CLIP.git
-  - git+https://github.com/nghorbani/human_body_prior
-  - gdown
-  - moviepy
+  - werkzeug==2.0.1
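
Because the environment is renamed from T2M-GPT to P-BIGE, an existing local checkout needs its conda environment recreated. A possible sequence, assuming an environment was previously created under the old name:

    # Drop the old environment if present, then rebuild under the new name
    conda env remove -n T2M-GPT
    conda env create -f environment.yml   # creates the P-BIGE environment
    conda activate P-BIGE

    # Quick import check for the newly added dependencies
    python -c "import deepspeed, clip, gdown, moviepy; print('env OK')"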

train_vq.yaml

Lines changed: 89 additions & 0 deletions

@@ -0,0 +1,89 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: train-vq-job
+spec:
+  template:
+    spec:
+      containers:
+      - name: gpu-container
+        image: gitlab-registry.nrp-nautilus.io/shmaheshwari/digital-coach-anwesh:latest
+        command: ["/bin/bash", "-c"]
+        args:
+          - |
+            conda init bash
+            source ~/.bashrc
+            source /root/.bashrc
+            conda activate T2M-GPT
+
+            apt-get update && apt-get install -y nvidia-cuda-toolkit && \
+            mkdir -p /usr/local/cuda/bin && \
+            ln -sf /usr/bin/nvcc /usr/local/cuda/bin/nvcc && \
+            ln -sf /usr/lib/x86_64-linux-gnu /usr/local/cuda/lib64 && \
+            export CUDA_HOME=/usr/local/cuda && \
+            export PATH=$PATH:/root/.local/bin && \
+            export PATH=$PATH:$CUDA_HOME/bin:/root/.local/bin && \
+            export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDA_HOME/lib64 && \
+
+            export CUDA_HOME=/usr/local/cuda && \
+            export PATH=$PATH:$CUDA_HOME/bin:/root/.local/bin && \
+            export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDA_HOME/lib64 && \
+
+            export PATH=$PATH:$CUDA_HOME/bin:/root/.local/bin
+            export PYTHONUNBUFFERED=1
+
+            git clone https://github.com/Starfractor/P-BIGE.git /workspace &&
+            cd /workspace && \
+
+            pip install nimblephysics && \
+            pip install tensorboard && \
+            pip install git+https://github.com/openai/CLIP.git && \
+            pip install scipy && \
+            pip install imageio && \
+            pip install matplotlib && \
+            pip install deepspeed && \
+
+            deepspeed train_vq.py \
+              --batch-size 256 \
+              --lr 2e-4 \
+              --total-iter 300000 \
+              --lr-scheduler 200000 \
+              --nb-code 512 \
+              --down-t 2 \
+              --depth 3 \
+              --dilation-growth-rate 3 \
+              --out-dir output \
+              --dataname mcs \
+              --vq-act relu \
+              --quantizer ema_reset \
+              --loss-vel 0.5 \
+              --recons-loss l1_smooth \
+              --exp-name VQVAE9_Bs32
+        resources:
+          limits:
+            nvidia.com/gpu: 2
+            cpu: "2.4"
+            memory: "9830Mi"
+            ephemeral-storage: "20Gi"
+          requests:
+            nvidia.com/gpu: 2
+            cpu: "2"
+            memory: "8Gi"
+            ephemeral-storage: "10Gi"
+        volumeMounts:
+        - name: biomechanics-dataset
+          mountPath: /home/mnt/data
+      volumes:
+      - name: biomechanics-dataset
+        persistentVolumeClaim:
+          claimName: add-biomechanics-dataset-pvc
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: nvidia.com/gpu.product
+                operator: In
+                values:
+                - NVIDIA-GeForce-RTX-2080-Ti
+      restartPolicy: Never
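
A possible way to submit and monitor this Job on the cluster; namespace and context flags are omitted here and may be required depending on the local kubectl configuration.

    # Create the batch Job defined above
    kubectl apply -f train_vq.yaml

    # Pods created by a Job carry the job-name label
    kubectl get pods -l job-name=train-vq-job

    # Stream the training output
    kubectl logs -f job/train-vq-job

    # Remove the Job (and its pods) once finished
    kubectl delete job train-vq-job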
