1212# See the License for the specific language governing permissions and
1313# limitations under the License.
1414
15- # SSH: username that connects to the remote cluster
16- export USER="DUMMY"
17- # SSH: remote cluster URL
18- export HOST="DUMMY"
15+ # SLURM: Username on a cluster
16+ export USER="<SLURM_USER>"
1917# Slurm: account for job submission
20- export ACCOUNT="DUMMY"
18+ export ACCOUNT="<SLURM_ACCOUNT>"
2119# Slurm: partition for job submission
22- export PARTITION="DUMMY"
20+ export PARTITION="<SLURM_PARTITION>"
2321# Slurm: job time limit, defaults to 8 hours
24- export TIME="08:00:00"
25- # Slurm: --nodes arguments, default to use 288 nodes
22+ export TIME="02:00:00"
23+ # Slurm: --nodes arguments
2624export NNODES=64
27- # Slurm: --gpus_per_node and --ntasks_per_node argument, defaults to 8 GPUs per node
25+ # Slurm: --gpus_per_node and --ntasks_per_node argument
2826export GPUS_PER_NODE=4
29- # Slurm: max job retries for transient job failures, defaults to retry 3 times
27+ # Slurm: max job retries for transient job failures
3028export MAX_RETRIES=1
3129
3230# Folder mapping:
3331# Output directory that holds logs, any path that you like.
34- export JOB_DIR="/workspace/code/logs"
32+ export LOG_DIR="<LOG_DIR>"
3533# Image / container path, either local cache file or remote URL
36- export IMAGE="DUMMY"
34+ export IMAGE="<IMAGE>"
3735# Dataset: C4 dataset location that contains the dataset after preprocessing
38- # export ORIGINAL_C4_PATH="/data/data/C4"
39-
40- # This corresponds to the PREPROCESSED_PATH in README section 3's dataset download part
41- export PREPROCESSED_PATH="/data/deepseek_v3_671b/data/C4_processed"
42- export MERGED_C4_PATH="/data/deepseek_v3_671b/data/C4_merged"
43- # Dataset: Numpy index working directory, contains shuffled dataset
44- # This path must be able to hold >400GB data
45- export TMP_NPY_INDEX="/data/npy_indices"
46- # Dataset: Tokenizer path
47- # This corresponds to the TOKENIZER_PATH in README section 3's tokenizer download part
48- export TOKENIZER_PATH="/data/deepseek_v3_671b/model/DeepSeek-V3-671B-Base"
49- # export TOKENIZER_PATH="/data/llama3_405b_ref/tokenizer"
50-
51- export MODEL_CKPT="$TOKENIZER_PATH"
36+ export DATA_DIR="<DATA_DIR>"
37+ # Model checkpoint path
38+ export MODEL_CKPT="<MODEL_CKPT>"
5239
5340# Training Configs:
5441# Dataloader: Global batch size
@@ -66,7 +53,7 @@ export PIPELINE_PARALLEL_SIZE=4
6653export CONTEXT_PARALLEL_SIZE=1
6754export EXPERT_PARALLEL_SIZE=64
6855export EXPERT_TENSOR_PARALLEL_SIZE=1
69- export RECOMPUTE_MODULES="mlp,moe_act"
56+ export RECOMPUTE_MODULES="mlp,moe"
7057export CUDA_GRAPH_IMPLEMENTATION="transformer_engine"
7158export CUDA_GRAPH_SCOPE="attn"
7259
0 commit comments