17 changes: 9 additions & 8 deletions src/MaxText/configs/base.yml
@@ -90,11 +90,12 @@ grad_dtype: "float32"
dtype: "bfloat16"
# Used to configure quantization in the transformer layers, defaults to null implying bf16.
# Possible alternative settings are as follows:
# 'int8' for dynamic range quantization using 8-bits
# 'int8' for 8-bit integer quantization.
# 'intmp' for mixed precision quantization for inference as described here: src/MaxText/configs/quantization/README.md
# 'fp8' for 8-bit floating-point GeMMs on NVIDIA GPUs.
# 'nanoo_fp8' for 8-bit floating-point GeMMs on AMD MI300/MI325 GPUs.
# 'fp8' for 8-bit floating-point quantization.
# 'fp8_full' for FP8 quantization with static scaling.
# 'fp8_gpu' for FP8 for NVIDIA GPUs.
# 'fp8_nanoo' for FP8 for AMD MI300/MI325 GPUs.
quantization: ""
# Used to configure constant_bound_config in aqt lib for static scaling, e.g. constant_bound_config='0.5, 0.5, 0.5, 0.5, 0.5, 0.5'
constant_bound_config: ""
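# Illustrative sketch, not part of this diff: commented-out overrides showing how a quantization
# mode and its static-scaling bounds could be selected together. The bound values are placeholders
# copied from the example above, not recommended settings.
# quantization: "fp8_full"
# constant_bound_config: "0.5, 0.5, 0.5, 0.5, 0.5, 0.5"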
@@ -234,7 +235,7 @@ pipeline_delay_activation_forwarding: False # This delays the activation forward
# and you must set the number of microbatches to at least 2 * num_stages (the minimum 2 * num_stages is set by default with this delay).
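# Illustrative sketch, not part of this diff: with 4 pipeline stages, enabling the delay requires
# at least 8 microbatches. The key name num_pipeline_microbatches is assumed from the surrounding
# config rather than taken from this diff, and the values are placeholders.
# pipeline_delay_activation_forwarding: True
# num_pipeline_microbatches: 8  # >= 2 * num_stages once the delay is enabled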

model_fsdp_ag_once: False # This controls whether the Zero-1 optimization is active.
# This is a memory/time tradeoff - True: This is Zero-1 Sharding. Use ZeroOneTransformer to gather weights once per gradient step.
# False: This is Zero-3 Sharding. Use the standard Transformer, which gathers for each microbatch's fwd/bwd pass.
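# Illustrative sketch, not part of this diff: a commented-out override enabling the Zero-1 style
# gather, trading extra memory for a single weight all-gather per gradient step instead of one per
# microbatch.
# model_fsdp_ag_once: True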
pipeline_fsdp_ag_once: False # If set to true, all-gather all of the weights over FSDP before the first pipeline iteration.
# This is a memory/time tradeoff - we now have to store the FSDP gathered weights and gradients (typically in bf16), as opposed
@@ -284,7 +285,7 @@ param_scan_axis: 1
# The attention_type parameter determines the variants of attention, e.g. global or local_sliding
attention: 'autoselected' # Supported attention: autoselected, dot_product, flash, cudnn_flash_te
attention_type: 'global' # Supported attention_type: global, local_sliding, chunk, mla
attention_bias: False # If True, adds a learnable bias to the query, key, and value projections
attention_sink: False
sliding_window_size: 0
chunk_attn_window_size: 0
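# Illustrative sketch, not part of this diff: a commented-out local sliding-window attention setup;
# the window size is an arbitrary placeholder.
# attention_type: 'local_sliding'
# sliding_window_size: 1024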
@@ -402,7 +403,7 @@ logical_axis_rules: [
['embed_no_exp', ['fsdp', 'sequence', 'tensor_transpose', 'context']],
['embed_no_exp', ['fsdp', 'fsdp_transpose', 'sequence', 'context']],
['embed_no_exp', ['fsdp', 'sequence', 'context']],
['embed_tensor_transpose', ['tensor_transpose']],
['q_lora', ['fsdp', 'fsdp_transpose', 'sequence', 'context', 'tensor_transpose', 'expert']],
['q_lora', ['fsdp', 'sequence', 'context', 'tensor_transpose', 'expert']],
['q_lora', ['fsdp', 'fsdp_transpose', 'sequence', 'context', 'expert']],
@@ -507,7 +508,7 @@ per_device_batch_size: 12.0
# Each data-loading host will load per_device_batch_size * expansion_factor_real_data.
# When set to a value between 0 and 1, the grain pipeline can use a smaller chip count to read a checkpoint written by a larger chip-count job.
# Details in https://github.com/AI-Hypercomputer/maxtext/blob/main/docs/guides/data_input_grain.md#using-grain
expansion_factor_real_data: -1.0
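# Illustrative sketch, not part of this diff; the values are placeholders: with
# per_device_batch_size of 12.0 and expansion_factor_real_data of 2.0, each data-loading host
# loads 12.0 * 2.0 = 24 examples, while a value between 0 and 1 covers the grain
# checkpoint-restore case described above.
# expansion_factor_real_data: 2.0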
eval_per_device_batch_size: 0.0
max_corpus_chars: 10_000_000
train_data_columns: ['text'] # for DPO dataset containing "chosen" and "rejected"
@@ -883,7 +884,7 @@ vision_output_dim_for_vit: 4096
pixel_shuffle_ratio_for_vit: 0.5
projector_dropout_for_vit: 0.0

# Subslice shape in the form of "x,y,z" when using pathways (single controller).
# Example: "8,8" to use a 8x8 subgrid (64 chips) of a full pod (16x16) of trillium.
subslice_shape: ""
