Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion steptronoss/core/trainers/lm_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,7 +349,8 @@ def setup_model(self, model_config: Megatron3DParallelModelConfig) -> torch.nn.M

# Fp16 conversion.
if model_config.params_dtype in [torch.float16, torch.bfloat16]:
- model = [Float16Module(model_module, model_config.params_dtype) for model_module in model]
+ fp32_output = getattr(model_config, "fp32_output", True)
+ model = [Float16Module(model_module, model_config.params_dtype, fp32_output) for model_module in model]

model = torch.nn.ModuleList(model)

Expand Down
3 changes: 2 additions & 1 deletion steptronoss/core/trainers/packed_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,8 @@ def setup_model(self, model_config: Megatron3DParallelModelConfig) -> torch.nn.M

# Fp16 conversion.
if model_config.params_dtype in [torch.float16, torch.bfloat16]:
- model = [Float16Module(model_module, model_config.params_dtype) for model_module in model]
+ fp32_output = getattr(model_config, "fp32_output", True)
+ model = [Float16Module(model_module, model_config.params_dtype, fp32_output) for model_module in model]

model = torch.nn.ModuleList(model)

Expand Down
5 changes: 5 additions & 0 deletions steptronoss/exp/base_exp.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,11 @@ class MegatronPPModelConfig(AbstractModelConfig):

fp32_residual_connection: bool = False

fp32_output: bool = True
"""Upcast the model's final output to fp32 in Float16Module.forward.
Set False to keep logits in params_dtype (bf16/fp16); useful for bounding
transient memory on long-seq + large-vocab configs."""

def get_pp_scheduler(self) -> FWBWScheduler:
from steptronoss.core.parallel_state import PM, get_vpp_size
from steptronoss.core.pipeline_parallel.schedules import (
Expand Down
Loading