mlcommons · suachong · Aug 15, 2025 · Aug 15, 2025 · Aug 16, 2025 · Aug 16, 2025
@@ -3,3 +3,7 @@ __pycache__/
 *.py[cod]
 *$py.class
 single_stage_detector/mlcube/workspace/*
+
+# Local dev and output folders
+dev/
+output/
@@ -0,0 +1,50 @@
+#
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+
+# AMD (ROCm) training image: Primus base image plus a pinned Primus checkout,
+# its Megatron-LM submodule, the benchmark sources and the primus-mllog wheel.
+ARG BASE_IMAGE=docker.io/rocm/primus:v25.11
+FROM ${BASE_IMAGE}
+
+WORKDIR /workspace/deps
+
+# Drop any Primus directory already present here, then clone and pin it to a
+# fixed commit; submodules are re-synced after the checkout to match that commit.
+RUN rm -rf Primus && \
+    git clone --recursive https://github.com/AMD-AIG-AIMA/Primus.git && \
+    cd Primus && \
+    git checkout 85c51c0da12f7d9b819f944eba9ffeb313795b9a && \
+    git submodule update --init --recursive && \
+    pip install --no-cache-dir -r requirements.txt
+
+# Editable install of the Megatron-LM submodule; --no-deps skips its dependencies.
+WORKDIR /workspace/deps/Primus/third_party/Megatron-LM
+RUN pip install --no-cache-dir -e . --no-deps
+
+WORKDIR /workspace/code
+
+COPY . .
+
+# primus-mllog is installed from the wheel copied in with the build context.
+RUN pip install --no-cache-dir primus_mllog-0.1.0-py3-none-any.whl
@@ -0,0 +1,38 @@
+# NVIDIA training image: NVIDIA PyTorch base image plus a pinned Primus checkout,
+# its Megatron-LM submodule, grouped_gemm, the benchmark sources and the primus-mllog wheel.
+ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:25.12-py3
+FROM ${BASE_IMAGE}
+
+WORKDIR /workspace
+
+# NOTE(review): these packages are unpinned; pin versions if reproducible builds are required.
+RUN pip install --no-cache-dir \
+    ninja \
+    packaging \
+    pybind11 \
+    pyyaml \
+    transformers
+
+WORKDIR /workspace/deps
+# Clone Primus, pin it to a fixed commit, and re-sync submodules to match that commit.
+RUN git clone --recursive https://github.com/AMD-AIG-AIMA/Primus.git && \
+    cd Primus && \
+    git checkout 85c51c0da12f7d9b819f944eba9ffeb313795b9a && \
+    git submodule update --init --recursive && \
+    pip install --no-cache-dir -r requirements.txt
+
+# Editable install of the Megatron-LM submodule; --no-deps skips its dependencies.
+WORKDIR /workspace/deps/Primus/third_party/Megatron-LM
+RUN pip install --no-cache-dir -e . --no-deps
+
+ENV PYTHONPATH="/workspace/deps/Primus:/workspace/deps/Primus/third_party/Megatron-LM"
+
+# Built from source at tag v1.1.4. Kept ahead of COPY so that changes to the
+# build context do not invalidate this layer and force a rebuild.
+RUN pip install --no-cache-dir --no-build-isolation git+https://github.com/fanshiqing/grouped_gemm@v1.1.4
+
+WORKDIR /workspace/code
+COPY . .
+
+# Install primus-mllog from local wheel
+RUN pip install --no-cache-dir primus_mllog-0.1.0-py3-none-any.whl
@@ -0,0 +1,138 @@
+# GPT-OSS-20B Pretraining Benchmark
+
+GPT-OSS 20B (Mixture of Experts)
+
+## Overview
+
+This benchmark trains a 20B parameter GPT model with Mixture of Experts (MoE) architecture using the Primus framework on AMD and NVIDIA GPUs.
+
+# 1. Setup Docker Image
+
+
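+Run the following build command from this directory. It builds the AMD (ROCm) image from `Dockerfile`; to build the NVIDIA image instead, pass `-f Dockerfile.nvidia` to `docker build` and choose a matching image tag. The build process will take a while to complete.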
+
+```bash
+# From gpt-oss-20b/primus directory
+docker build -t rocm/amd-mlperf:gpt_oss_20b_training_5.1 .
+```
+
+# 2. Prepare Dataset
+
+The current codebase uses the c4/en/3.0.1 dataset from [HuggingFace/AllenAI](https://huggingface.co/datasets/allenai/c4) for training and evaluation.
+
+## Download Preprocessed Data
+
+The pre-tokenized dataset is available for download. Navigate to your desired download directory and run the following commands:
+
+```bash
+# Change into your download directory (create it first with the right permissions)
+cd /data/gpt_oss_20b
+
+# Download training and validation data
+bash <(curl -s https://raw.githubusercontent.com/mlcommons/r2-downloader/refs/heads/main/mlc-r2-downloader.sh) \
+    -d data https://training.mlcommons-storage.org/metadata/llama-3-1-8b-preprocessed-c4-dataset.uri
+```
+
+After download, you should see files with the following naming conventions:
+- Training: `c4-train.en_6_text_document.bin` and `.idx`
+- Validation: `c4-validation-91205-samples.en_text_document.bin` and `.idx`
+
+The data directory is approximately **80 GB**.
+
+# 3. Run Training
+
+## Set Environment Variables
+
+Set the data and results directories, the container image, and your Hugging Face token. Ensure `$LOGDIR` is writable.
+
+```bash
+export DATADIR=/data/gpt_oss_20b/data
+export LOGDIR=/data/gpt_oss_20b/results
+export CONT=rocm/amd-mlperf:gpt_oss_20b_training_5.1
+export HF_TOKEN=<your HF token>
+
+# Create results directory
+mkdir -p $LOGDIR
+sudo chmod -R 777 $LOGDIR
+```
+
+## Set Configuration
+
+Source the configuration file that matches your hardware; it sets the system-specific hyperparameters:
+
+| Config File | System | GPUs |
+|-------------|--------|------|
+| `config_MI355X_1x8x1.sh` | MI355X | 1 node × 8 GPUs |
+| `config_B200_1x8x1.sh` | B200 | 1 node × 8 GPUs |
+
+```bash
+source config_MI355X_1x8x1.sh
+```
+
+## Launch Training
+
+### Single Run
+
+```bash
+export NEXP=1
+bash run_with_docker.sh
+```
+
+### Multiple Runs (for submission)
+
+```bash
+export NEXP=10
+bash run_with_docker.sh
+```
+
+After completion, logs will be available under `$LOGDIR`.
+
+# 4. Quality Metrics
+
+## Quality Metric
+
+Validation loss (log perplexity)
+
+## Evaluation Frequency
+
+Evaluation every **768 iterations** (12,288 samples with GBS=16)
+
+## Evaluation Thoroughness
+
+We evaluate using **1024 sequences** from the validation dataset.
+
+# 5. Model Architecture
+
+| Parameter | Value |
+|-----------|-------|
+| Model Size | 20B parameters |
+| Architecture | GPT with Mixture of Experts |
+| Sequence Length | 8192 |
+| Expert Parallelism | 8 |
+
+# 6. Training Configuration
+
+| Hyperparameter | Value |
+|----------------|-------|
+| Micro Batch Size | 2 |
+| Global Batch Size | 16 |
+| Learning Rate | 8e-4 |
+| LR Schedule | Cosine decay with warmup |
+| Weight Decay | 0.1 |
+| Adam β1, β2 | 0.9, 0.95 |
+| Training Iterations | 20,000 |
+
+# 7. Directory Structure
+
+```
+gpt-oss-20b/primus/
+├── conf/                       # Configuration files
+│   └── gpt_oss_20B-pretrain.yaml
+├── src/                        # Training source code
+│   └── train.py
+├── config_MI355X_1x8x1.sh      # System configuration (MI355X - AMD)
+├── config_B200_1x8x1.sh        # System configuration (B200 - NVIDIA)
+├── Dockerfile                  # Dockerfile (MI355X - AMD)
+├── Dockerfile.nvidia           # Dockerfile (B200 - NVIDIA)
+└── requirements.txt            # Python dependencies (includes primus-mllog)
+```
@@ -0,0 +1,107 @@
+work_group: ${TEAM:nvidia}  # ${NAME:default} entries are presumably env-var lookups with a fallback — confirm in the Primus config loader
+user_name: ${USER:root}
+exp_name: ${EXP_NAME:gpt_oss_20b_nvidia}
+workspace: ./output  # relative path
+
+modules:
+  pre_trainer:
+    framework: megatron
+    config: pre_trainer.yaml
+
+    # model to run (file name = PRIMUS_MODEL value, default gpt_oss_20B, plus ".yaml")
+    model: ${PRIMUS_MODEL:gpt_oss_20B}.yaml
+    overrides:
+      # logging
+      wandb_project: "Primus_GPT_OSS_20B_NVIDIA"
+      stderr_sink_level: DEBUG
+      log_interval: 99999999  # Suppress console logs
+
+      # debug
+      moe_router_force_load_balancing: true  # NOTE(review): grouped under "debug" but enabled — confirm this is intended for benchmark runs
+      log_avg_skip_iterations: 2
+      log_avg_reset_interval: 50
+
+      # profiling (disabled: profile and use_pytorch_profiler are both false)
+      profile: false
+      use_pytorch_profiler: false
+      profile_step_end: 7
+      profile_step_start: 6
+
+      # precision (mixed precision training)
+      # Using bf16 for B200
+      bf16: true
+      fp16: false
+      fp8: null  # Disabled - using bf16 instead
+
+      # hyperparameters (PRIMUS_* / SEED placeholders carry the default values)
+      train_iters: ${PRIMUS_TRAIN_ITERS:20000}
+      micro_batch_size: ${PRIMUS_MICRO_BATCH_SIZE:2}
+      global_batch_size: ${PRIMUS_GLOBAL_BATCH_SIZE:16}
+      seq_length: ${PRIMUS_SEQ_LENGTH:8192}
+      max_position_embeddings: ${PRIMUS_MAX_POSITION_EMBEDDINGS:8192}
+      seed: ${SEED:1234} 
+      lr: ${PRIMUS_LR:8.0e-4}
+      min_lr: 8.0e-5  # 10% of the default lr (8.0e-4); fixed literal, not tied to the PRIMUS_LR placeholder
+      lr_warmup_iters: ${PRIMUS_LR_WARMUP_ITERS:128}
+      lr_decay_iters: 1199872  # NOTE(review): far above the default train_iters (20000) — confirm the decay horizon is intended
+      lr_decay_style: cosine
+      weight_decay: 0.1
+      adam_beta1: 0.9
+      adam_beta2: 0.95
+      eod_mask_loss: true
+      init_method_std: 0.008
+      norm_epsilon: 1.0e-6
+
+      # parallelism
+      tensor_model_parallel_size: ${PRIMUS_TP:1}
+      pipeline_model_parallel_size: ${PRIMUS_PP:1}
+      expert_model_parallel_size: ${PRIMUS_EP:8}
+      overlap_grad_reduce: true
+      overlap_param_gather: true
+
+      # data
+      mock_data: false
+      train_data_path: "10 /data/c4-train.en_6_text_document"  # leading "10" is presumably a dataset weight — TODO confirm
+      valid_data_path: "/data/c4-validation-91205-samples.en_text_document"
+      test_data_path: "/data/c4-validation-91205-samples.en_text_document"  # same prefix as valid_data_path
+
+      # fusion (standard Megatron optimizations) — every option in this group is set to false
+      moe_permute_fusion: false
+      gradient_accumulation_fusion: false
+      moe_grouped_gemm: false  # Disable grouped_gemm requirement
+      moe_use_legacy_grouped_gemm: false
+      moe_use_fused_router_with_aux_score: false
+      multi_latent_attention: false
+      apply_rope_fusion: false
+
+      # MoE router configuration
+      moe_shared_expert_overlap: false
+      moe_router_dtype: fp32
+
+      # checkpointing (load and save are null, so no checkpoint paths are configured here)
+      finetune: false
+      auto_continue_train: false
+      load: null
+      no_load_optim: null
+      no_load_rng: null
+      save: null
+      save_interval: 20000  # equal to the default train_iters
+      no_save_optim: null
+      no_save_rng: null
+      disable_last_saving: true
+      exit_on_missing_checkpoint: false
+      ckpt_format: torch
+      eval_iters: 64  # 64 iters × 2 MBS × 8 GPUs = 1024 eval samples
+      eval_interval: ${PRIMUS_EVAL_INTERVAL:768}
+
+      # Turbo features disabled for NVIDIA
+      enable_primus_turbo: false
+      use_turbo_attention: false
+      use_turbo_grouped_mlp: false
+
+      use_turbo_deepep: false
+      turbo_deepep_num_cu: 0
+      turbo_deepep_use_comm_stream: false
+
+      turbo_sync_free_moe_stage: 0
+