-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathjob.sh
More file actions
39 lines (34 loc) · 1.49 KB
/
job.sh
File metadata and controls
39 lines (34 loc) · 1.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#!/bin/bash
# Slurm batch script that runs a command through `uv run` on one node.
#
# Usage: sbatch job.sh <command and arguments forwarded to `uv run`>
#
# Resources requested below: 1 node, 1 task on that node, 4 GPUs and 48 CPUs
# for the task, a 3-hour time limit, and --mem=0 (which Slurm treats as
# "all of the memory on the node"). Output is written to
# logs/<jobid>/slurm-<jobid>.out (%j expands to the job id).
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-task=4
#SBATCH --cpus-per-task=48
#SBATCH --mem=0
#SBATCH --time=0-03:00:00
#SBATCH --output=logs/%j/slurm-%j.out

# -e: abort on the first failing command.
# -o pipefail: a pipeline fails if any of its stages fails.
# Do not spell this `-eof pipefail`: bash reads the stray `f` as the noglob
# option and silently disables pathname expansion for the rest of the script.
# -u (nounset) is intentionally left off because GIT_COMMIT is optional and is
# read below while it may be unset.
set -eo pipefail
# Prevents changes in the python files from messing up future jobs.
#
# When GIT_COMMIT is set, the repo is cloned into the job's SLURM_TMPDIR, the
# requested commit is checked out, and the rest of the script runs from inside
# that clone. Otherwise the script keeps running from the current directory
# and only prints a notice about the state of the working repo.
repo="$HOME/repos/scaling_pqn" # could also use the current directory.
dest="${SLURM_TMPDIR:-}/$(basename "$repo")"
if [[ -n "${GIT_COMMIT:-}" ]]; then
  # Without SLURM_TMPDIR the clone target would silently become "/<repo name>".
  : "${SLURM_TMPDIR:?SLURM_TMPDIR must be set to clone the repo when GIT_COMMIT is given}"
  git clone "$repo" "$dest"
  echo "Checking out commit $GIT_COMMIT"
  cd "$dest"
  git checkout "$GIT_COMMIT"
elif [[ -n "$(git -C "$repo" status --porcelain)" ]]; then
  # `git status --porcelain` prints one line per modified/untracked path, so
  # any output at all means the working tree is dirty.
  echo "Warning: GIT_COMMIT is not set and the current repo at ~/repos/scaling_pqn has uncommitted changes."
  echo "This may cause future jobs to fail or produce inconsistent results!"
  echo "Consider using the 'safe_sbatch' script to submit jobs instead."
else
  echo "GIT_COMMIT environment variable is not set, but the repo state is clean. "
  echo "If you modify the files in the repo, future jobs might fail or produce inconsistent results. "
fi
# Runtime environment for JAX/XLA:
#  - JAX_TRACEBACK_FILTERING=off turns JAX's traceback filtering off.
#  - XLA_PYTHON_CLIENT_PREALLOCATE=false disables GPU memory pre-allocation so
#    that the memory actually in use is what shows up in wandb.
#    Caveat: according to the warning printed when it is used, this could
#    potentially affect jax.distributed.initialize.
export JAX_TRACEBACK_FILTERING=off XLA_PYTHON_CLIENT_PREALLOCATE=false

# module load httpproxy/1.0
# NOTE: seems to be needed only on the Mila cluster; without it a warning
# related to "ptax" (presumably ptxas) shows up.
# NOTE(review): this note presumably refers to the cuda module below - confirm.
module load cuda/12.6.0

# Forward every argument given to this script, unchanged, to `uv run`
# (offline, with the lockfile frozen), launched through srun.
launch_cmd=(srun uv run --offline --frozen "$@")
"${launch_cmd[@]}"