-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathrun_1gpu.sh
More file actions
executable file
·48 lines (41 loc) · 1.34 KB
/
run_1gpu.sh
File metadata and controls
executable file
·48 lines (41 loc) · 1.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#!/usr/bin/env bash
#
# Launch training with the vith16plus config on a single node / single GPU
# through `uv run torchrun`.
#
# Usage: ./run_1gpu.sh   (may be invoked from any working directory)
#
# Settings are edited in place below:
#   CONFIG_FILE  training config, relative to the directory of this script
#   OUTPUT_DIR   run output directory, relative to the directory of this script
#   RESUME       "True"  -> keep OUTPUT_DIR and launch without --no-resume
#                "False" -> delete OUTPUT_DIR and launch with --no-resume
set -euo pipefail

# Distributed-launch settings, exported for the launched processes.
export MASTER_PORT=30001
export NNODES=1
export NPROC_PER_NODE=1
export CUDA_VISIBLE_DEVICES="0"
export NODE_RANK=0

# Assign first and export afterwards: `export VAR=$(cmd)` would hide a failing
# pipeline behind the exit status of `export`. stderr of `hostname` is
# silenced on purpose because a fallback is applied right below.
MASTER_ADDR="$(hostname -I 2>/dev/null | awk '{print $1}')" || MASTER_ADDR=""
if [[ -z "${MASTER_ADDR}" ]]; then
  # With NNODES=1 every rank runs on this host, so loopback is sufficient.
  echo "warning: 'hostname -I' returned no address; using 127.0.0.1" >&2
  MASTER_ADDR="127.0.0.1"
fi
export MASTER_ADDR

readonly CONFIG_FILE="./dinov3/configs/train/vith16plus.yaml"
readonly OUTPUT_DIR="./output_vith16plus_1gpu"
readonly RESUME="False"

# Absolute, symlink-resolved directory that contains this script.
REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)"
export DINOV3_RUN_SCRIPT="${REPO_ROOT}/$(basename "${BASH_SOURCE[0]}")"
# Prepend REPO_ROOT to PYTHONPATH, keeping any existing entries.
export PYTHONPATH="${REPO_ROOT}${PYTHONPATH:+:${PYTHONPATH}}"
export PATH="$HOME/.local/bin:$PATH"

# All paths below are relative; anchor them to the script's directory so the
# cleanup and the launch never act on whatever directory the caller was in.
cd "${REPO_ROOT}"

# Arguments for train.py are collected in an array so that the optional flag
# is either one exact argument or absent, without relying on word-splitting.
train_args=(
  --config-file "${CONFIG_FILE}"
  --output-dir "${OUTPUT_DIR}"
)

# Only the exact strings "True" and "False" are accepted. Anything else is
# rejected instead of being treated as "False", because that branch deletes
# the output directory.
case "${RESUME}" in
  True)
    echo "Resume enabled; preserving ${OUTPUT_DIR}"
    ;;
  False)
    echo "Resume disabled; cleaning ${OUTPUT_DIR}"
    # `:?` aborts if the variable is ever empty; `--` ends option parsing.
    rm -rf -- "${OUTPUT_DIR:?}"
    train_args+=(--no-resume)
    ;;
  *)
    echo "error: RESUME must be \"True\" or \"False\" (got \"${RESUME}\")" >&2
    exit 2
    ;;
esac

mkdir -p -- "${OUTPUT_DIR}"

echo "[Master Node] Starting training..."
echo "MASTER_ADDR=${MASTER_ADDR}"
echo "MASTER_PORT=${MASTER_PORT}"
echo "NNODES=${NNODES}, NPROC_PER_NODE=${NPROC_PER_NODE}"
echo "CONFIG_FILE=${CONFIG_FILE}"
echo "OUTPUT_DIR=${OUTPUT_DIR}"
echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}"

uv run torchrun \
  --nnodes "${NNODES}" \
  --nproc_per_node "${NPROC_PER_NODE}" \
  --node_rank "${NODE_RANK}" \
  --master_addr "${MASTER_ADDR}" \
  --master_port "${MASTER_PORT}" \
  dinov3/train/train.py \
  "${train_args[@]}"