Skip to content

Commit b498194

Browse files
ko3n1g authored and jaredcasper committed
ADLR/megatron-lm!1954 - Style: Formatting and imports
1 parent ef85bc9 commit b498194

File tree

25 files changed

+571
-411
lines changed

25 files changed

+571
-411
lines changed

.flake8

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
11
[flake8]
22
max-line-length = 100
3-
extend-ignore = E203
3+
extend-ignore = E203,E501,F401,E402,E714
44
per-file-ignores = __init__.py:F401

.gitlab/stages/01.tests.yml

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -123,8 +123,9 @@ formatting:
123123
stage: test
124124
needs: [build_image]
125125
script:
126+
- env
126127
- git fetch origin main
127-
- CHECK_ONLY=true bash tools/autoformat.sh
128+
- CHECK_ONLY=true SKIP_DOCS=$([[ "$CI_MERGE_REQUEST_LABELS" == *"Skip docs"* ]] && echo "true" || echo "false") bash tools/autoformat.sh
128129

129130
copyright:
130131
extends: [.tests_common]

.pylintrc

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,12 @@
11
[MAIN]
22
ignore-paths=tests
3+
max-line-length=100
34

45
[MESSAGES CONTROL]
56
disable=all
67

7-
enable=C0115,C0116
8+
enable=C0115,C0116,W0611,C0301
89
# C0115: missing-class-docstring
9-
# C0116: missing-function-docstring
10+
# C0116: missing-function-docstring
11+
# W0611: unused-import
12+
# C0301: line-too-long

megatron/core/models/gpt/gpt_layer_specs.py

Lines changed: 36 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,7 @@
11
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
22

3+
from typing import Optional
4+
35
from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
46
from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
57
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
@@ -9,7 +11,6 @@
911
from megatron.core.transformer.mlp import MLP, MLPSubmodules
1012
from megatron.core.transformer.moe.moe_layer import MoELayer
1113
from megatron.core.transformer.spec_utils import ModuleSpec
12-
from megatron.core.transformer.transformer_block import TransformerBlockSubmodules
1314
from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
1415

1516
try:
@@ -27,7 +28,7 @@
2728
HAVE_TE = False
2829

2930
try:
30-
import apex
31+
import apex # pylint: disable=unused-import
3132

3233
from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
3334

@@ -38,14 +39,26 @@
3839

3940
from megatron.core.transformer.torch_layer_norm import WrappedTorchLayerNorm
4041

41-
warnings.warn(f'Apex is not installed. Falling back to Torch LayerNorm')
42+
warnings.warn('Apex is not installed. Falling back to Torch LayerNorm')
4243
LNImpl = WrappedTorchLayerNorm
4344

4445

45-
# Use this spec to use lower level Transformer Engine modules (required for fp8 training)
4646
def get_gpt_layer_with_transformer_engine_spec(
47-
num_experts: int = None, moe_grouped_gemm: bool = False, qk_layernorm: bool = False
47+
num_experts: Optional[int] = None,
48+
moe_grouped_gemm: Optional[bool] = False,
49+
qk_layernorm: Optional[bool] = False,
4850
) -> ModuleSpec:
51+
"""Use this spec to use lower-level Transformer Engine modules (required for fp8 training).
52+
53+
54+
Args:
55+
num_experts (int, optional): Number of experts. Defaults to None.
56+
moe_grouped_gemm (bool, optional): To use Grouped GEMM. Defaults to False.
57+
qk_layernorm (bool, optional): To use layernorm for queries/keys. Defaults to False.
58+
59+
Returns:
60+
ModuleSpec: Module specification with TE modules
61+
"""
4962
mlp = _get_mlp_module_spec(
5063
use_te=True, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm
5164
)
@@ -73,10 +86,22 @@ def get_gpt_layer_with_transformer_engine_spec(
7386
)
7487

7588

76-
# Use this spec for an implementation using only modules in megatron core
7789
def get_gpt_layer_local_spec(
78-
num_experts: int = None, moe_grouped_gemm: bool = False, qk_layernorm: bool = False
90+
num_experts: Optional[int] = None,
91+
moe_grouped_gemm: Optional[bool] = False,
92+
qk_layernorm: Optional[bool] = False,
7993
) -> ModuleSpec:
94+
"""Use this spec for an implementation using only modules in Megatron-Core.
95+
96+
97+
Args:
98+
num_experts (int, optional): Number of experts. Defaults to None.
99+
moe_grouped_gemm (bool, optional): To use Grouped GEMM. Defaults to False.
100+
qk_layernorm (bool, optional): To use layernorm for queries/keys. Defaults to False.
101+
102+
Returns:
103+
ModuleSpec: Module specification with Megatron-Core modules
104+
"""
80105
mlp = _get_mlp_module_spec(
81106
use_te=False, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm
82107
)
@@ -107,10 +132,12 @@ def get_gpt_layer_local_spec(
107132
)
108133

109134

110-
# Helper function to get module spec for MLP/MoE
111135
def _get_mlp_module_spec(
112-
use_te: bool = True, num_experts: int = None, moe_grouped_gemm: bool = False
136+
use_te: Optional[bool] = True,
137+
num_experts: Optional[int] = None,
138+
moe_grouped_gemm: Optional[bool] = False,
113139
) -> ModuleSpec:
140+
"""Helper function to get module spec for MLP/MoE"""
114141
if num_experts is None:
115142
# Dense MLP w/ or w/o TE modules.
116143
return ModuleSpec(

megatron/core/models/gpt/gpt_model.py

Lines changed: 41 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -1,43 +1,58 @@
11
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
22

3-
import logging
43
from collections import OrderedDict
5-
from typing import Dict, Literal, Optional, Tuple, Union
4+
from typing import Dict, Literal, Optional
65

7-
import torch
86
from torch import Tensor
97

10-
from megatron.core import InferenceParams, parallel_state, tensor_parallel
8+
from megatron.core import InferenceParams, tensor_parallel
119
from megatron.core.config_logger import has_config_logger_enabled, log_config_to_disk
1210
from megatron.core.dist_checkpointing.mapping import ShardedStateDict
1311
from megatron.core.models.common.embeddings.language_model_embedding import LanguageModelEmbedding
1412
from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding
1513
from megatron.core.models.common.language_module.language_module import LanguageModule
1614
from megatron.core.packed_seq_params import PackedSeqParams
17-
from megatron.core.transformer.enums import AttnMaskType, ModelType
15+
from megatron.core.transformer.enums import ModelType
1816
from megatron.core.transformer.spec_utils import ModuleSpec
1917
from megatron.core.transformer.transformer_block import TransformerBlock
2018
from megatron.core.transformer.transformer_config import TransformerConfig
21-
from megatron.core.utils import make_tp_sharded_tensor_for_checkpoint
2219

2320

2421
class GPTModel(LanguageModule):
2522
"""GPT Transformer language model.
2623
2724
Args:
28-
config (TransformerConfig): Transformer config
29-
transformer_layer_spec (ModuleSpec): Specifies module to use for transformer layers
30-
vocab_size (int): Vocabulary size
31-
max_sequence_length (int): maximum size of sequence. This is used for positional embedding
32-
pre_process (bool, optional): Include embedding layer (used with pipeline parallelism). Defaults to True.
33-
post_process (bool, optional): Include an output layer (used with pipeline parallelism). Defaults to True.
34-
fp16_lm_cross_entropy (bool, optional): Defaults to False.
35-
parallel_output (bool, optional): Do not gather the outputs, keep them split across tensor parallel ranks. Defaults to True.
36-
share_embeddings_and_output_weights (bool, optional): When True, input embeddings and output logit weights are shared. Defaults to False.
37-
position_embedding_type (Literal[learned_absolute,rope], optional): Position embedding type.. Defaults to 'learned_absolute'.
38-
rotary_percent (float, optional): Percent of rotary dimension to use for rotary position embeddings. Ignored unless position_embedding_type is 'rope'. Defaults to 1.0.
39-
rotary_base (int, optional): Base period for rotary position embeddings. Ignored unless position_embedding_type is 'rope'. Defaults to 10000.
40-
seq_len_interpolation_factor (Optional[float], optional): scale of linearly interpolating RoPE for longer sequences. The value must be a float larger than 1.0. Defaults to None.
25+
config (TransformerConfig):
26+
Transformer config
27+
transformer_layer_spec (ModuleSpec):
28+
Specifies module to use for transformer layers
29+
vocab_size (int):
30+
Vocabulary size
31+
max_sequence_length (int):
32+
maximum size of sequence. This is used for positional embedding
33+
pre_process (bool, optional):
34+
Include embedding layer (used with pipeline parallelism). Defaults to True.
35+
post_process (bool, optional):
36+
Include an output layer (used with pipeline parallelism). Defaults to True.
37+
fp16_lm_cross_entropy (bool, optional):
38+
Defaults to False.
39+
parallel_output (bool, optional):
40+
Do not gather the outputs, keep them split across tensor
41+
parallel ranks. Defaults to True.
42+
share_embeddings_and_output_weights (bool, optional):
43+
When True, input embeddings and output logit weights are shared. Defaults to False.
44+
position_embedding_type (Literal[learned_absolute,rope], optional):
45+
Position embedding type.. Defaults to 'learned_absolute'.
46+
rotary_percent (float, optional):
47+
Percent of rotary dimension to use for rotary position embeddings.
48+
Ignored unless position_embedding_type is 'rope'. Defaults to 1.0.
49+
rotary_base (int, optional):
50+
Base period for rotary position embeddings. Ignored unless
51+
position_embedding_type is 'rope'.
52+
Defaults to 10000.
53+
seq_len_interpolation_factor (Optional[float], optional):
54+
scale of linearly interpolating RoPE for longer sequences.
55+
The value must be a float larger than 1.0. Defaults to None.
4156
"""
4257

4358
def __init__(
@@ -113,8 +128,9 @@ def __init__(
113128
# all the micro-batches of a global batch for the last pipeline stage. Once we are
114129
# done with all the back props for all the microbatches for the last pipeline stage,
115130
# it will be in the pipeline flush stage. During this pipeline flush we use the
116-
# input activations stored in embedding activation buffer and gradient outputs stored
117-
# in gradient buffer to calculate the weight gradients for the embedding final linear layer.
131+
# input activations stored in embedding activation buffer and gradient outputs
132+
# stored in gradient buffer to calculate the weight gradients for the embedding
133+
# final linear layer.
118134
self.embedding_activation_buffer = []
119135
self.grad_output_buffer = []
120136
else:
@@ -239,7 +255,8 @@ def forward(
239255
def sharded_state_dict(
240256
self, prefix: str = '', sharded_offsets: tuple = (), metadata: Optional[Dict] = None
241257
) -> ShardedStateDict:
242-
"""Sharded state dict implementation for GPTModel backward-compatibility (removing extra state).
258+
"""Sharded state dict implementation for GPTModel backward-compatibility
259+
(removing extra state).
243260
244261
Args:
245262
prefix (str): Module name prefix.
@@ -252,8 +269,8 @@ def sharded_state_dict(
252269
sharded_state_dict = super().sharded_state_dict(prefix, sharded_offsets, metadata)
253270
output_layer_extra_state_key = f'{prefix}output_layer._extra_state'
254271

255-
# Old GPT checkpoints only stored the output layer weight key. So we remove the _extra_state key
256-
# but check that it doesn't contain any data anyway
272+
# Old GPT checkpoints only stored the output layer weight key. So we remove the
273+
# _extra_state key but check that it doesn't contain any data anyway
257274
output_extra_state = sharded_state_dict.pop(output_layer_extra_state_key, None)
258275
assert not (
259276
output_extra_state and output_extra_state.data

megatron/core/parallel_state.py

Lines changed: 14 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -255,7 +255,8 @@ def __init__(
255255
for name in self.name_to_size.keys():
256256
if name not in order and self.name_to_size[name] != 1:
257257
raise RuntimeError(
258-
f"The size of ({name}) is ({self.name_to_size[name]}), but you haven't specified the order ({self.order})."
258+
f"The size of ({name}) is ({self.name_to_size[name]}), but you haven't"
259+
f"specified the order ({self.order})."
259260
)
260261
elif name not in order:
261262
order = order + '-' + name
@@ -355,6 +356,7 @@ def initialize_model_parallel(
355356
get_embedding_ranks: Optional[Callable[[List[int], Optional[int]], List[int]]] = None,
356357
get_position_embedding_ranks: Optional[Callable[[List[int], Optional[int]], List[int]]] = None,
357358
) -> None:
359+
# pylint: disable=line-too-long
358360
"""Initialize model data parallel groups.
359361
360362
Args:
@@ -524,7 +526,8 @@ def initialize_model_parallel(
524526

525527
if data_parallel_size % expert_model_parallel_size != 0:
526528
raise RuntimeError(
527-
f"data_parallel_size ({data_parallel_size}) is not divisible by expert_model_parallel_size "
529+
f"data_parallel_size ({data_parallel_size}) is not divisible by "
530+
"expert_model_parallel_size "
528531
)
529532

530533
encoder_world_size = encoder_model_size * data_parallel_size
@@ -999,20 +1002,23 @@ def get_tensor_and_context_parallel_group():
9991002

10001003

10011004
def get_expert_model_parallel_group():
1005+
"""Get the expert model parallel group the caller rank belongs to."""
10021006
assert (
10031007
_EXPERT_MODEL_PARALLEL_GROUP is not None
10041008
), 'expert model parallel group is not initialized'
10051009
return _EXPERT_MODEL_PARALLEL_GROUP
10061010

10071011

10081012
def get_tensor_and_expert_parallel_group():
1013+
"""Get the tensor and expert parallel group the caller rank belongs to."""
10091014
assert (
10101015
_TENSOR_AND_EXPERT_PARALLEL_GROUP is not None
10111016
), 'tensor and expert parallel group is not initialized'
10121017
return _TENSOR_AND_EXPERT_PARALLEL_GROUP
10131018

10141019

10151020
def get_data_modulo_expert_parallel_group(with_context_parallel=False):
1021+
"""Get the data modulo expert parallel group the caller rank belongs to."""
10161022
if with_context_parallel:
10171023
assert (
10181024
_DATA_MODULO_EXPERT_PARALLEL_GROUP_WITH_CP is not None
@@ -1026,6 +1032,7 @@ def get_data_modulo_expert_parallel_group(with_context_parallel=False):
10261032

10271033

10281034
def get_data_modulo_expert_parallel_group_gloo(with_context_parallel=False):
1035+
"""Get the data modulo expert parallel group gloo the caller rank belongs to."""
10291036
if with_context_parallel:
10301037
assert (
10311038
_DATA_MODULO_EXPERT_PARALLEL_GROUP_WITH_CP_GLOO is not None
@@ -1039,6 +1046,7 @@ def get_data_modulo_expert_parallel_group_gloo(with_context_parallel=False):
10391046

10401047

10411048
def set_expert_model_parallel_world_size(world_size):
1049+
"""Sets the expert model parallel world size."""
10421050
global _MPU_EXPERT_MODEL_PARALLEL_WORLD_SIZE
10431051
_MPU_EXPERT_MODEL_PARALLEL_WORLD_SIZE = world_size
10441052

@@ -1327,7 +1335,8 @@ def get_pipeline_model_parallel_last_rank():
13271335

13281336
def get_pipeline_model_parallel_next_rank():
13291337
"""Return the global rank that follows the caller in the pipeline, for each pipeline group that
1330-
the rank is part of. If it's just part of one group, an int is returned, otherwise a list of ints.
1338+
the rank is part of. If it's just part of one group, an int is returned,
1339+
otherwise a list of ints.
13311340
"""
13321341
assert _PIPELINE_GLOBAL_RANKS is not None, "Pipeline parallel group is not initialized"
13331342
rank_in_pipeline = get_pipeline_model_parallel_rank()
@@ -1343,7 +1352,8 @@ def get_pipeline_model_parallel_next_rank():
13431352

13441353
def get_pipeline_model_parallel_prev_rank():
13451354
"""Return the global rank that preceeds the caller in the pipeline, for each pipeline group that
1346-
the rank is part of. If it's just part of one group, an int is returned, otherwise a list of ints.
1355+
the rank is part of. If it's just part of one group, an int is returned,
1356+
otherwise a list of ints.
13471357
"""
13481358
assert _PIPELINE_GLOBAL_RANKS is not None, "Pipeline parallel group is not initialized"
13491359
rank_in_pipeline = get_pipeline_model_parallel_rank()

0 commit comments

Comments (0)