25 commits (all by ilml, Mar 20, 2026):

96c085f  Add new files from b17248aa3 Move config src files into a dedicated d…
80290bc  Add new files from 60a25aa67 Optimize away add request overheads in d…
3ef2dcb  Add new files from 310082a6d μP: Maximal Update Parameterization (#3…
40b3e32  Add new files from 2f8c9bcc8 Add GPTOSS Example with Megatron-LM + Me…
0a26a46  Add new files from c9312e6a5 Inference | KV prefix caching. (#3063)
e6e48b7  Add new files from 7d1c01685 Inference Optimized MoEs (#3496)
2212896  Add new files from 9b18de4d9 Prefix caching | Coordinator scheduling.…
cd962dc  Add new files from 257094703 Nemo-RL Refit (#3520)
5743bd1  Add new files from 94a903ba0 chore: CLI launch internal CI (#3695)
2fa4c85  Add new files from 0d42bc6a0 Offload Flask frontend to separate proce…
9a3595e  Add new files from 37ca7152c [main] Add TE CUDA Graph Support for Vis…
9fecf8b  Add new files from 8318b8093 Fused dLN + add in backwards pass (#3384)
e43d5c6  Add new files from 0e19bf11f Add CP + Sequence Packing support for Mi…
ee8cad9  Add new files from fca1679e2 MXFP8 refit (#3742)
87ffced  Add new files from 5bc89f368 Add NVIDIA-Nemotron-3-Super-120B-A12B-BF…
3caae97  Add new files from 8f539df74 Add speculative decoding support with MT…
239992b  Add new files from d1b8e27d2 Add unit tests for speculative decoding …
41e43e8  Add new files from 905c0e386 Nemo-RL integration bugfixes for --trans…
111feea  Add new files from 589cd9e12 Add torch grouped gemm bf16 and mxfp8 su…
1187050  Add new files from 83498ef9c Add Lion optimizer support (#3813)
fdd847c  Add new files from 0ca9b6395 Support multimodule pipelining in 1F1B s…
aab793e  Add new files from c4bffde9e Inference | Hybrid prefix caching. (#3225)
b18c7a6  Add new files from dde4701ea Implement forced lag in RL (#3517)
e23cff0  Remove 3 test files that import symbols not yet on dev
ab305c3  Remove 7 more test files with module-level import failures on dev
55 changes: 55 additions & 0 deletions examples/gptoss/01_convert_from_hf.py
@@ -0,0 +1,55 @@
# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.

"""Convert HuggingFace checkpoints to Megatron format."""

import argparse
import os

from megatron.bridge import AutoBridge


def _parse_args():
    parser = argparse.ArgumentParser(description="Convert HF LLMs to Megatron format")
    parser.add_argument(
        "--hf-model",
        type=str,
        required=True,
        help="HuggingFace model identifier or path",
    )
    parser.add_argument(
        "--save-path",
        type=str,
        default=None,
        help="Path to save the converted Megatron checkpoint",
    )
    # Accept both spellings so the script works under torchrun and legacy launchers.
    parser.add_argument("--local-rank", "--local_rank", type=int, default=0)
    return parser.parse_args()


if __name__ == "__main__":
    args = _parse_args()
    HF_MODEL = args.hf_model
    SAVE_PATH = args.save_path
    WORLD_SIZE = int(os.environ.get("WORLD_SIZE", 1))

    if SAVE_PATH is None:
        SAVE_PATH = f"./megatron_checkpoints/{HF_MODEL.replace('/', '_')}"

    print(f"Converting {HF_MODEL} to Megatron format...")
    print(f"Save path: {SAVE_PATH}")

    bridge = AutoBridge.from_hf_pretrained(HF_MODEL, trust_remote_code=True)
    provider = bridge.to_megatron_provider()
    # Update these configs as needed
    provider.expert_tensor_parallel_size = 1
    provider.tensor_model_parallel_size = 1
    provider.pipeline_model_parallel_size = WORLD_SIZE
    provider.finalize()

    model = provider.provide_distributed_model(wrap_with_ddp=False)

    bridge.save_megatron_model(
        model,
        SAVE_PATH,
        hf_tokenizer_path=HF_MODEL,
    )

    print(f"Saved Megatron checkpoint to {SAVE_PATH}")
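For reference, a hypothetical single-process launch of this script via torchrun (which populates WORLD_SIZE and --local-rank); the model identifier and output path are illustrative, not taken from this PR. With one process, pipeline parallelism is set to 1:

torchrun --nproc_per_node=1 examples/gptoss/01_convert_from_hf.py \
    --hf-model openai/gpt-oss-20b \
    --save-path ./megatron_checkpoints/gpt-oss-20b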
259 changes: 259 additions & 0 deletions examples/gptoss/02_train.sh
@@ -0,0 +1,259 @@
#!/bin/bash

export CUDA_DEVICE_MAX_CONNECTIONS=${CUDA_DEVICE_MAX_CONNECTIONS:-1}


# Setup arguments with defaults
CHECKPOINT_PATH="NO_VALUE_PROVIDED"
TENSORBOARD_LOGS_PATH="./tensorboard_logs/"
TOKENIZER_ARG="MOCK"
DATA_ARG="MOCK"
DISTRIBUTED_CONFIG_FILE=""

# Parse command line arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        --checkpoint-path)
            CHECKPOINT_PATH="$2"
            shift 2
            ;;
        --tensorboard-logs-path)
            TENSORBOARD_LOGS_PATH="$2"
            shift 2
            ;;
        --tokenizer)
            TOKENIZER_ARG="$2"
            shift 2
            ;;
        --data)
            DATA_ARG="$2"
            shift 2
            ;;
        --distributed-config-file)
            DISTRIBUTED_CONFIG_FILE="$2"
            shift 2
            ;;
        -h|--help)
            echo "Usage: $0 [OPTIONS]"
            echo "Options:"
            echo "  --checkpoint-path PATH          Path to Megatron checkpoint"
            echo "  --tensorboard-logs-path PATH    Path to TensorBoard logs"
            echo "  --tokenizer PATH|MOCK           Path to tokenizer model, or 'MOCK' (default: MOCK)"
            echo "  --data PATH|MOCK                Data prefix, or 'MOCK' (default: MOCK)"
            echo "  --distributed-config-file FILE  Path to distributed training config file"
            echo "  -h, --help                      Show this help message"
            exit 0
            ;;
        *)
            echo "Unknown option: $1"
            echo "Use --help for usage information"
            exit 1
            ;;
    esac
done

# Check if checkpoint path exists
if [ ! -d "$CHECKPOINT_PATH" ]; then
    echo "Error: Checkpoint path does not exist: $CHECKPOINT_PATH (set it with --checkpoint-path)"
    exit 1
fi
echo "Checkpoint path exists: $CHECKPOINT_PATH"

# Create the TensorBoard logs path if it does not already exist
if [ ! -d "$TENSORBOARD_LOGS_PATH" ]; then
    echo "Warning: TensorBoard logs path does not exist. Creating: $TENSORBOARD_LOGS_PATH"
    mkdir -p "$TENSORBOARD_LOGS_PATH"
fi
echo "Using TensorBoard logs path: $TENSORBOARD_LOGS_PATH"

# NOTE: defaults to a single node with 8 GPUs. Values already present in the
# environment (or set by a sourced config file below) take precedence.
GPUS_PER_NODE=${GPUS_PER_NODE:-8}
NUM_NODES=${NUM_NODES:-1}
MASTER_ADDR=${MASTER_ADDR:-localhost}
MASTER_PORT=${MASTER_PORT:-6000}
NODE_RANK=${NODE_RANK:-0}

# Load distributed config from file if provided; an example of the expected
# contents follows this block
if [ -n "$DISTRIBUTED_CONFIG_FILE" ]; then
    if [ ! -f "$DISTRIBUTED_CONFIG_FILE" ]; then
        echo "Warning: Distributed config file does not exist: $DISTRIBUTED_CONFIG_FILE"
        echo "Continuing with default distributed training settings."
    else
        echo "Loading distributed config from: $DISTRIBUTED_CONFIG_FILE"
        source "$DISTRIBUTED_CONFIG_FILE"
    fi
fi
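# The distributed config file is plain shell sourced into this script, so it
# should contain assignments to the variables used above. A hypothetical
# two-node example (values are illustrative, not from this PR):
#
#   GPUS_PER_NODE=8
#   NUM_NODES=2
#   MASTER_ADDR=10.0.0.1
#   MASTER_PORT=6000
#   NODE_RANK=0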

# Apply defaults for any values still unset after sourcing the config file
GPUS_PER_NODE=${GPUS_PER_NODE:-8}
NUM_NODES=${NUM_NODES:-1}
MASTER_ADDR=${MASTER_ADDR:-localhost}
MASTER_PORT=${MASTER_PORT:-6000}
NODE_RANK=${NODE_RANK:-0}
WORLD_SIZE=$((GPUS_PER_NODE * NUM_NODES))

# Path to the pretrain_gpt.py script, assuming this script is run from the root of the Megatron-LM repository
PRETRAIN_SCRIPT_PATH="pretrain_gpt.py"

# Data cache path (useful for both mock and real data)
DATA_CACHE_PATH="${PWD}/benchmark_cache_gpt_oss_20b"
mkdir -p "$DATA_CACHE_PATH"

DISTRIBUTED_ARGS=(
    --nproc_per_node $GPUS_PER_NODE
    --nnodes $NUM_NODES
    --master_addr $MASTER_ADDR
    --master_port $MASTER_PORT
    --node_rank $NODE_RANK
)

# NOTE: by default only pipeline parallelism is used, sized to the total number of GPUs.
# Adjust each value based on your setup.
TP_SIZE=1
EP_SIZE=1
PP_SIZE=${WORLD_SIZE}
MICRO_BATCH_SIZE=1
GLOBAL_BATCH_SIZE=128
NUM_LAYERS=12
DTYPE="fp8"
SEQ_LENGTH=8192
MAX_POSITION_EMBEDDINGS=8192
TRAIN_SAMPLES=1953125000
LR_DECAY_SAMPLES=1949218748

MODEL_ARGS=(
    --no-masked-softmax-fusion
    --transformer-impl transformer_engine
    --disable-bias-linear
    --untie-embeddings-and-output-weights
    --no-rope-fusion
    --normalization RMSNorm
    --num-layers ${NUM_LAYERS}
    --hidden-size 512
    --ffn-hidden-size 2048
    --num-attention-heads 64
    --group-query-attention
    --num-query-groups 8
    --seq-length ${SEQ_LENGTH}
    --max-position-embeddings ${MAX_POSITION_EMBEDDINGS}
    --use-mcore-models
    --rotary-percent 1.0
    --rope-type rope
    --position-embedding-type rope
    --rotary-base 10000
    --no-bias-gelu-fusion
    --export-force-local-attention
    --no-bias-dropout-fusion
    --quick-geglu
    --glu-linear-offset 1.0
    --softmax-type learnable
    --window-attn-skip-freq 2
    --activation-func-clamp-value 7.0
    --window-size 128,0
    --enable-gpt-oss
)

MOE_ARGS=(
    --num-experts 4
    --moe-router-topk 2
    --moe-router-load-balancing-type aux_loss
    --moe-aux-loss-coeff 1e-3
    --moe-grouped-gemm
    --moe-token-dispatcher-type alltoall
    --overlap-param-gather
    --overlap-grad-reduce
    --moe-ffn-hidden-size 2048
    --moe-router-dtype fp32
    --moe-z-loss-coeff 1e-3
    --moe-permute-fusion
)

DATA_ARGS_LIST=()
if [[ "$TOKENIZER_ARG" == "MOCK" ]] || [[ "$DATA_ARG" == "MOCK" ]] || [[ -z "$TOKENIZER_ARG" ]]; then
    DATA_ARGS_LIST+=(
        --mock-data
        --tokenizer-type NullTokenizer
        --vocab-size 128256
        --data-cache-path ${DATA_CACHE_PATH}
        --tiktoken-pattern v2
        --split 99,1,0
        --no-create-attention-mask-in-dataloader
        --no-mmap-bin-files
        --num-workers 1
    )
else
    # Settings for real data
    DATA_ARGS_LIST+=(
        --data-path $DATA_ARG
        --tokenizer-type HuggingFaceTokenizer
        --tokenizer-model $TOKENIZER_ARG
        --data-cache-path ${DATA_CACHE_PATH}
        --split 99,1,0
        --no-create-attention-mask-in-dataloader
        --no-mmap-bin-files
        --num-workers 1
        # Note: --vocab-size might be inferred by HuggingFaceTokenizer or might need to be explicit.
        --vocab-size 128256
    )
fi

TRAINING_ARGS=(
    --micro-batch-size ${MICRO_BATCH_SIZE}
    --global-batch-size ${GLOBAL_BATCH_SIZE}
    --lr 1.0e-5
    --train-samples ${TRAIN_SAMPLES}
    --lr-decay-samples ${LR_DECAY_SAMPLES}
    --lr-decay-style cosine
    --min-lr 1.0e-6
    --weight-decay 0.1
    --lr-warmup-fraction 0.05
    --clip-grad 1.0
    --bf16
    --use-flash-attn
    --attention-softmax-in-fp32
    --accumulate-allreduce-grads-in-fp32
    --disable-bf16-reduced-precision-matmul
    --recompute-activations
)

MODEL_PARALLEL_ARGS=(
    --tensor-model-parallel-size ${TP_SIZE}
    --pipeline-model-parallel-size ${PP_SIZE}
    --expert-model-parallel-size ${EP_SIZE}
    --sequence-parallel
    --context-parallel-size 1
    --use-distributed-optimizer
    --fp8-format hybrid
    --fp8-param-gather
    --fp8-amax-compute-algo max
    --fp8-amax-history-len 1024
)

LOGGING_ARGS=(
    --log-interval 1
    --save-interval 10000
    --eval-interval 50000000
    --eval-iters 0
    --save $CHECKPOINT_PATH
    --tensorboard-dir "${CHECKPOINT_PATH}/tensorboard"
    --moe-per-layer-logging
    --no-load-optim
    --no-load-rng
    --log-throughput
)

# Ensure pretrain_gpt.py is found
if [ ! -f "$PRETRAIN_SCRIPT_PATH" ]; then
    echo "Error: pretrain_gpt.py not found at $PRETRAIN_SCRIPT_PATH"
    echo "Please ensure you are running this script from the root of the Megatron-LM repository and that pretrain_gpt.py is present."
    exit 1
fi

python -m torch.distributed.run "${DISTRIBUTED_ARGS[@]}" "$PRETRAIN_SCRIPT_PATH" \
    "${MODEL_ARGS[@]}" \
    "${MOE_ARGS[@]}" \
    "${DATA_ARGS_LIST[@]}" \
    "${TRAINING_ARGS[@]}" \
    "${MODEL_PARALLEL_ARGS[@]}" \
    "${LOGGING_ARGS[@]}"
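A hypothetical invocation with the default mock data, assuming the script is run from the Megatron-LM repository root and the checkpoint was produced by 01_convert_from_hf.py (paths are illustrative, not from this PR):

bash examples/gptoss/02_train.sh \
    --checkpoint-path ./megatron_checkpoints/openai_gpt-oss-20b \
    --tensorboard-logs-path ./tensorboard_logs/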
52 changes: 52 additions & 0 deletions examples/gptoss/03_convert_to_hf.py
@@ -0,0 +1,52 @@
# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.

"""Convert Megatron checkpoints back to HuggingFace format."""

import argparse
import os

from megatron.bridge import AutoBridge


def _parse_args():
    parser = argparse.ArgumentParser(description="Convert Megatron LLMs to HuggingFace format")
    parser.add_argument(
        "--hf-model",
        type=str,
        required=True,
        help="HuggingFace model identifier or path to load config from",
    )
    parser.add_argument(
        "--megatron-model",
        type=str,
        required=True,
        help="Megatron model identifier or path",
    )
    parser.add_argument(
        "--save-path",
        type=str,
        default=None,
        help="Path to save the converted HuggingFace checkpoint",
    )
    parser.add_argument("--local-rank", "--local_rank", type=int, default=0)
    return parser.parse_args()


if __name__ == "__main__":
    args = _parse_args()
    HF_MODEL = args.hf_model
    MEGATRON_MODEL = args.megatron_model
    SAVE_PATH = args.save_path
    WORLD_SIZE = int(os.environ.get("WORLD_SIZE", 1))  # unused below

    if SAVE_PATH is None:
        SAVE_PATH = f"./huggingface_checkpoints/{MEGATRON_MODEL.replace('/', '_')}"

    print(f"Converting {MEGATRON_MODEL} to HuggingFace {HF_MODEL} format...")
    print(f"Save path: {SAVE_PATH}")

    bridge = AutoBridge.from_hf_pretrained(HF_MODEL, trust_remote_code=True)
    bridge.export_ckpt(
        MEGATRON_MODEL,
        SAVE_PATH,
    )

    print(f"Saved HuggingFace checkpoint to {SAVE_PATH}")
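And a hypothetical export back to HuggingFace format, closing the round trip (identifiers and paths are illustrative, not from this PR):

torchrun --nproc_per_node=1 examples/gptoss/03_convert_to_hf.py \
    --hf-model openai/gpt-oss-20b \
    --megatron-model ./megatron_checkpoints/openai_gpt-oss-20b \
    --save-path ./huggingface_checkpoints/gpt-oss-20b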